nltk-3.1/0000755000076500000240000000000012610001541012042 5ustar sbstaff00000000000000nltk-3.1/INSTALL.txt0000644000076500000240000000027212574600335013731 0ustar sbstaff00000000000000To install NLTK, run setup.py from an administrator account, e.g.: sudo python setup.py install For full installation instructions, please see http://nltk.github.com/install.html nltk-3.1/LICENSE.txt0000644000076500000240000000106112607224144013677 0ustar sbstaff00000000000000Copyright (C) 2001-2015 NLTK Project Licensed under the Apache License, Version 2.0 (the 'License'); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. nltk-3.1/MANIFEST.in0000644000076500000240000000025412574600335013620 0ustar sbstaff00000000000000include LICENSE.txt INSTALL.txt README.txt MANIFEST.in include setup.py include nltk/test/*.doctest include nltk/VERSION recursive-include *.txt Makefile global-exclude *~ nltk-3.1/nltk/0000755000076500000240000000000012610001541013012 5ustar sbstaff00000000000000nltk-3.1/nltk/__init__.py0000644000076500000240000001371512607224144015146 0ustar sbstaff00000000000000# Natural Language Toolkit (NLTK) # # Copyright (C) 2001-2015 NLTK Project # Authors: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ The Natural Language Toolkit (NLTK) is an open source Python library for Natural Language Processing. A free online book is available. (If you use the library for academic research, please cite the book.) Steven Bird, Ewan Klein, and Edward Loper (2009). Natural Language Processing with Python. O'Reilly Media Inc. http://nltk.org/book """ from __future__ import print_function, absolute_import import os # ////////////////////////////////////////////////////// # Metadata # ////////////////////////////////////////////////////// # Version. For each new release, the version number should be updated # in the file VERSION. try: # If a VERSION file exists, use it! version_file = os.path.join(os.path.dirname(__file__), 'VERSION') with open(version_file, 'r') as infile: __version__ = infile.read().strip() except NameError: __version__ = 'unknown (running code interactively?)' except IOError as ex: __version__ = "unknown (%s)" % ex if __doc__ is not None: # fix for the ``python -OO`` __doc__ += '\n@version: ' + __version__ # Copyright notice __copyright__ = """\ Copyright (C) 2001-2015 NLTK Project. Distributed and Licensed under the Apache License, Version 2.0, which is included by reference. """ __license__ = "Apache License, Version 2.0" # Description of the toolkit, keywords, and the project's primary URL. __longdescr__ = """\ The Natural Language Toolkit (NLTK) is a Python package for natural language processing. NLTK requires Python 2.6 or higher.""" __keywords__ = ['NLP', 'CL', 'natural language processing', 'computational linguistics', 'parsing', 'tagging', 'tokenizing', 'syntax', 'linguistics', 'language', 'natural language', 'text analytics'] __url__ = "http://nltk.org/" # Maintainer, contributors, etc. 
__maintainer__ = "Steven Bird, Edward Loper, Ewan Klein" __maintainer_email__ = "stevenbird1@gmail.com" __author__ = __maintainer__ __author_email__ = __maintainer_email__ # "Trove" classifiers for Python Package Index. __classifiers__ = [ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'Intended Audience :: Education', 'Intended Audience :: Information Technology', 'Intended Audience :: Science/Research', 'License :: OSI Approved :: Apache Software License', 'Operating System :: OS Independent', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', 'Topic :: Scientific/Engineering', 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Scientific/Engineering :: Human Machine Interfaces', 'Topic :: Scientific/Engineering :: Information Analysis', 'Topic :: Text Processing', 'Topic :: Text Processing :: Filters', 'Topic :: Text Processing :: General', 'Topic :: Text Processing :: Indexing', 'Topic :: Text Processing :: Linguistic', ] from nltk.internals import config_java # support numpy from pypy try: import numpypy except ImportError: pass # Override missing methods on environments where it cannot be used like GAE. import subprocess if not hasattr(subprocess, 'PIPE'): def _fake_PIPE(*args, **kwargs): raise NotImplementedError('subprocess.PIPE is not supported.') subprocess.PIPE = _fake_PIPE if not hasattr(subprocess, 'Popen'): def _fake_Popen(*args, **kwargs): raise NotImplementedError('subprocess.Popen is not supported.') subprocess.Popen = _fake_Popen ########################################################### # TOP-LEVEL MODULES ########################################################### # Import top-level functionality into top-level namespace from nltk.collocations import * from nltk.decorators import decorator, memoize from nltk.featstruct import * from nltk.grammar import * from nltk.probability import * from nltk.text import * from nltk.tree import * from nltk.util import * from nltk.jsontags import * ########################################################### # PACKAGES ########################################################### from nltk.chunk import * from nltk.classify import * from nltk.inference import * from nltk.metrics import * from nltk.parse import * from nltk.tag import * from nltk.tokenize import * from nltk.translate import * from nltk.sem import * from nltk.stem import * # Packages which can be lazily imported # (a) we don't import * # (b) they're slow to import or have run-time dependencies # that can safely fail at run time from nltk import lazyimport app = lazyimport.LazyModule('nltk.app', locals(), globals()) chat = lazyimport.LazyModule('nltk.chat', locals(), globals()) corpus = lazyimport.LazyModule('nltk.corpus', locals(), globals()) draw = lazyimport.LazyModule('nltk.draw', locals(), globals()) toolbox = lazyimport.LazyModule('nltk.toolbox', locals(), globals()) # Optional loading try: import numpy except ImportError: pass else: from nltk import cluster from nltk.downloader import download, download_shell try: import tkinter except ImportError: pass else: try: from nltk.downloader import download_gui except RuntimeError as e: import warnings warnings.warn("Corpus downloader GUI not loaded " "(RuntimeError during import: %s)" % str(e)) # explicitly import all top-level modules (ensuring # they override the same names inadvertently imported # from a subpackage) from nltk import ccg, chunk, classify, collocations from nltk import data, featstruct, grammar, help, inference, metrics 
from nltk import misc, parse, probability, sem, stem, wsd from nltk import tag, tbl, text, tokenize, translate, tree, treetransforms, util # override any accidentally imported demo def demo(): print("To run the demo code for a module, type nltk.module.demo()") nltk-3.1/nltk/app/0000755000076500000240000000000012610001541013572 5ustar sbstaff00000000000000nltk-3.1/nltk/app/__init__.py0000644000076500000240000000331112607224144015715 0ustar sbstaff00000000000000# Natural Language Toolkit: Applications package # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT """ Interactive NLTK Applications: chartparser: Chart Parser chunkparser: Regular-Expression Chunk Parser collocations: Find collocations in text concordance: Part-of-speech concordancer nemo: Finding (and Replacing) Nemo regular expression tool rdparser: Recursive Descent Parser srparser: Shift-Reduce Parser wordnet: WordNet Browser """ # Import Tkinter-based modules if Tkinter is installed import nltk.compat try: import tkinter except ImportError: import warnings warnings.warn("nltk.app package not loaded " "(please install Tkinter library).") else: from nltk.app.chartparser_app import app as chartparser from nltk.app.chunkparser_app import app as chunkparser from nltk.app.collocations_app import app as collocations from nltk.app.concordance_app import app as concordance from nltk.app.nemo_app import app as nemo from nltk.app.rdparser_app import app as rdparser from nltk.app.srparser_app import app as srparser from nltk.app.wordnet_app import app as wordnet try: from matplotlib import pylab except ImportError: import warnings warnings.warn("nltk.app.wordfreq not loaded " "(requires the matplotlib library).") else: from nltk.app.wordfreq_app import app as wordfreq # skip doctests from this package def setup_module(module): from nose import SkipTest raise SkipTest("nltk.app examples are not doctests") nltk-3.1/nltk/app/chartparser_app.py0000644000076500000240000025007512607224144017347 0ustar sbstaff00000000000000# Natural Language Toolkit: Chart Parser Application # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Jean Mark Gawron # Steven Bird # URL: # For license information, see LICENSE.TXT """ A graphical tool for exploring chart parsing. Chart parsing is a flexible parsing algorithm that uses a data structure called a "chart" to record hypotheses about syntactic constituents. Each hypothesis is represented by a single "edge" on the chart. A set of "chart rules" determine when new edges can be added to the chart. This set of rules controls the overall behavior of the parser (e.g. whether it parses top-down or bottom-up). The chart parsing tool demonstrates the process of parsing a single sentence, with a given grammar and lexicon. Its display is divided into three sections: the bottom section displays the chart; the middle section displays the sentence; and the top section displays the partial syntax tree corresponding to the selected edge. Buttons along the bottom of the window are used to control the execution of the algorithm. The chart parsing tool allows for flexible control of the parsing algorithm. At each step of the algorithm, you can select which rule or strategy you wish to apply. This allows you to experiment with mixing different strategies (e.g. top-down and bottom-up). You can exercise fine-grained control over the algorithm by selecting which edge you wish to apply a rule to. 
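A rough usage sketch (not part of the original description; it assumes a
working Tk display and uses a toy grammar chosen only for illustration).
The application can be driven programmatically through the ``ChartParserApp``
class defined below:

    >>> from nltk.grammar import CFG
    >>> grammar = CFG.fromstring('''
    ...     S -> NP VP
    ...     NP -> Det N
    ...     VP -> V NP
    ...     Det -> 'the'
    ...     N -> 'dog' | 'cat'
    ...     V -> 'saw'
    ... ''')
    >>> ChartParserApp(grammar, ['the', 'dog', 'saw', 'the', 'cat']).mainloop()

Alternatively, the ``chartparser`` entry point exported by ``nltk.app``
(see ``nltk/app/__init__.py`` above) opens the same tool interactively.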
""" # At some point, we should rewrite this tool to use the new canvas # widget system. import nltk.compat import pickle from tkinter.filedialog import asksaveasfilename, askopenfilename import tkinter import math import os.path import tkinter.font, tkinter.messagebox from nltk.parse.chart import (BottomUpPredictCombineRule, BottomUpPredictRule, Chart, LeafEdge, LeafInitRule, SingleEdgeFundamentalRule, SteppingChartParser, TopDownInitRule, TopDownPredictRule, TreeEdge) from nltk.tree import Tree from nltk.grammar import Nonterminal, CFG from nltk.util import in_idle from nltk.draw.util import (CanvasFrame, ColorizedList, EntryDialog, MutableOptionMenu, ShowText, SymbolWidget) from nltk.draw import CFGEditor, tree_to_treesegment, TreeSegmentWidget # Known bug: ChartView doesn't handle edges generated by epsilon # productions (e.g., [Production: PP -> ]) very well. ####################################################################### # Edge List ####################################################################### class EdgeList(ColorizedList): ARROW = SymbolWidget.SYMBOLS['rightarrow'] def _init_colortags(self, textwidget, options): textwidget.tag_config('terminal', foreground='#006000') textwidget.tag_config('arrow', font='symbol', underline='0') textwidget.tag_config('dot', foreground = '#000000') textwidget.tag_config('nonterminal', foreground='blue', font=('helvetica', -12, 'bold')) def _item_repr(self, item): contents = [] contents.append(('%s\t' % item.lhs(), 'nonterminal')) contents.append((self.ARROW, 'arrow')) for i, elt in enumerate(item.rhs()): if i == item.dot(): contents.append((' *', 'dot')) if isinstance(elt, Nonterminal): contents.append((' %s' % elt.symbol(), 'nonterminal')) else: contents.append((' %r' % elt, 'terminal')) if item.is_complete(): contents.append((' *', 'dot')) return contents ####################################################################### # Chart Matrix View ####################################################################### class ChartMatrixView(object): """ A view of a chart that displays the contents of the corresponding matrix. 
""" def __init__(self, parent, chart, toplevel=True, title='Chart Matrix', show_numedges=False): self._chart = chart self._cells = [] self._marks = [] self._selected_cell = None if toplevel: self._root = tkinter.Toplevel(parent) self._root.title(title) self._root.bind('', self.destroy) self._init_quit(self._root) else: self._root = tkinter.Frame(parent) self._init_matrix(self._root) self._init_list(self._root) if show_numedges: self._init_numedges(self._root) else: self._numedges_label = None self._callbacks = {} self._num_edges = 0 self.draw() def _init_quit(self, root): quit = tkinter.Button(root, text='Quit', command=self.destroy) quit.pack(side='bottom', expand=0, fill='none') def _init_matrix(self, root): cframe = tkinter.Frame(root, border=2, relief='sunken') cframe.pack(expand=0, fill='none', padx=1, pady=3, side='top') self._canvas = tkinter.Canvas(cframe, width=200, height=200, background='white') self._canvas.pack(expand=0, fill='none') def _init_numedges(self, root): self._numedges_label = tkinter.Label(root, text='0 edges') self._numedges_label.pack(expand=0, fill='none', side='top') def _init_list(self, root): self._list = EdgeList(root, [], width=20, height=5) self._list.pack(side='top', expand=1, fill='both', pady=3) def cb(edge, self=self): self._fire_callbacks('select', edge) self._list.add_callback('select', cb) self._list.focus() def destroy(self, *e): if self._root is None: return try: self._root.destroy() except: pass self._root = None def set_chart(self, chart): if chart is not self._chart: self._chart = chart self._num_edges = 0 self.draw() def update(self): if self._root is None: return # Count the edges in each cell N = len(self._cells) cell_edges = [[0 for i in range(N)] for j in range(N)] for edge in self._chart: cell_edges[edge.start()][edge.end()] += 1 # Color the cells correspondingly. for i in range(N): for j in range(i, N): if cell_edges[i][j] == 0: color = 'gray20' else: color = ('#00%02x%02x' % (min(255, 50+128*cell_edges[i][j]/10), max(0, 128-128*cell_edges[i][j]/10))) cell_tag = self._cells[i][j] self._canvas.itemconfig(cell_tag, fill=color) if (i,j) == self._selected_cell: self._canvas.itemconfig(cell_tag, outline='#00ffff', width=3) self._canvas.tag_raise(cell_tag) else: self._canvas.itemconfig(cell_tag, outline='black', width=1) # Update the edge list. edges = list(self._chart.select(span=self._selected_cell)) self._list.set(edges) # Update our edge count. self._num_edges = self._chart.num_edges() if self._numedges_label is not None: self._numedges_label['text'] = '%d edges' % self._num_edges def activate(self): self._canvas.itemconfig('inactivebox', state='hidden') self.update() def inactivate(self): self._canvas.itemconfig('inactivebox', state='normal') self.update() def add_callback(self, event, func): self._callbacks.setdefault(event,{})[func] = 1 def remove_callback(self, event, func=None): if func is None: del self._callbacks[event] else: try: del self._callbacks[event][func] except: pass def _fire_callbacks(self, event, *args): if event not in self._callbacks: return for cb_func in list(self._callbacks[event].keys()): cb_func(*args) def select_cell(self, i, j): if self._root is None: return # If the cell is already selected (and the chart contents # haven't changed), then do nothing. if ((i,j) == self._selected_cell and self._chart.num_edges() == self._num_edges): return self._selected_cell = (i,j) self.update() # Fire the callback. 
self._fire_callbacks('select_cell', i, j) def deselect_cell(self): if self._root is None: return self._selected_cell = None self._list.set([]) self.update() def _click_cell(self, i, j): if self._selected_cell == (i,j): self.deselect_cell() else: self.select_cell(i, j) def view_edge(self, edge): self.select_cell(*edge.span()) self._list.view(edge) def mark_edge(self, edge): if self._root is None: return self.select_cell(*edge.span()) self._list.mark(edge) def unmark_edge(self, edge=None): if self._root is None: return self._list.unmark(edge) def markonly_edge(self, edge): if self._root is None: return self.select_cell(*edge.span()) self._list.markonly(edge) def draw(self): if self._root is None: return LEFT_MARGIN = BOT_MARGIN = 15 TOP_MARGIN = 5 c = self._canvas c.delete('all') N = self._chart.num_leaves()+1 dx = (int(c['width'])-LEFT_MARGIN)/N dy = (int(c['height'])-TOP_MARGIN-BOT_MARGIN)/N c.delete('all') # Labels and dotted lines for i in range(N): c.create_text(LEFT_MARGIN-2, i*dy+dy/2+TOP_MARGIN, text=repr(i), anchor='e') c.create_text(i*dx+dx/2+LEFT_MARGIN, N*dy+TOP_MARGIN+1, text=repr(i), anchor='n') c.create_line(LEFT_MARGIN, dy*(i+1)+TOP_MARGIN, dx*N+LEFT_MARGIN, dy*(i+1)+TOP_MARGIN, dash='.') c.create_line(dx*i+LEFT_MARGIN, TOP_MARGIN, dx*i+LEFT_MARGIN, dy*N+TOP_MARGIN, dash='.') # A box around the whole thing c.create_rectangle(LEFT_MARGIN, TOP_MARGIN, LEFT_MARGIN+dx*N, dy*N+TOP_MARGIN, width=2) # Cells self._cells = [[None for i in range(N)] for j in range(N)] for i in range(N): for j in range(i, N): t = c.create_rectangle(j*dx+LEFT_MARGIN, i*dy+TOP_MARGIN, (j+1)*dx+LEFT_MARGIN, (i+1)*dy+TOP_MARGIN, fill='gray20') self._cells[i][j] = t def cb(event, self=self, i=i, j=j): self._click_cell(i,j) c.tag_bind(t, '', cb) # Inactive box xmax, ymax = int(c['width']), int(c['height']) t = c.create_rectangle(-100, -100, xmax+100, ymax+100, fill='gray50', state='hidden', tag='inactivebox') c.tag_lower(t) # Update the cells. self.update() def pack(self, *args, **kwargs): self._root.pack(*args, **kwargs) ####################################################################### # Chart Results View ####################################################################### class ChartResultsView(object): def __init__(self, parent, chart, grammar, toplevel=True): self._chart = chart self._grammar = grammar self._trees = [] self._y = 10 self._treewidgets = [] self._selection = None self._selectbox = None if toplevel: self._root = tkinter.Toplevel(parent) self._root.title('Chart Parser Application: Results') self._root.bind('', self.destroy) else: self._root = tkinter.Frame(parent) # Buttons if toplevel: buttons = tkinter.Frame(self._root) buttons.pack(side='bottom', expand=0, fill='x') tkinter.Button(buttons, text='Quit', command=self.destroy).pack(side='right') tkinter.Button(buttons, text='Print All', command=self.print_all).pack(side='left') tkinter.Button(buttons, text='Print Selection', command=self.print_selection).pack(side='left') # Canvas frame. self._cframe = CanvasFrame(self._root, closeenough=20) self._cframe.pack(side='top', expand=1, fill='both') # Initial update self.update() def update(self, edge=None): if self._root is None: return # If the edge isn't a parse edge, do nothing. if edge is not None: if edge.lhs() != self._grammar.start(): return if edge.span() != (0, self._chart.num_leaves()): return for parse in self._chart.parses(self._grammar.start()): if parse not in self._trees: self._add(parse) def _add(self, parse): # Add it to self._trees. 
self._trees.append(parse) # Create a widget for it. c = self._cframe.canvas() treewidget = tree_to_treesegment(c, parse) # Add it to the canvas frame. self._treewidgets.append(treewidget) self._cframe.add_widget(treewidget, 10, self._y) # Register callbacks. treewidget.bind_click(self._click) # Update y. self._y = treewidget.bbox()[3] + 10 def _click(self, widget): c = self._cframe.canvas() if self._selection is not None: c.delete(self._selectbox) self._selection = widget (x1, y1, x2, y2) = widget.bbox() self._selectbox = c.create_rectangle(x1, y1, x2, y2, width=2, outline='#088') def _color(self, treewidget, color): treewidget.label()['color'] = color for child in treewidget.subtrees(): if isinstance(child, TreeSegmentWidget): self._color(child, color) else: child['color'] = color def print_all(self, *e): if self._root is None: return self._cframe.print_to_file() def print_selection(self, *e): if self._root is None: return if self._selection is None: tkinter.messagebox.showerror('Print Error', 'No tree selected') else: c = self._cframe.canvas() for widget in self._treewidgets: if widget is not self._selection: self._cframe.destroy_widget(widget) c.delete(self._selectbox) (x1,y1,x2,y2) = self._selection.bbox() self._selection.move(10-x1,10-y1) c['scrollregion'] = '0 0 %s %s' % (x2-x1+20, y2-y1+20) self._cframe.print_to_file() # Restore our state. self._treewidgets = [self._selection] self.clear() self.update() def clear(self): if self._root is None: return for treewidget in self._treewidgets: self._cframe.destroy_widget(treewidget) self._trees = [] self._treewidgets = [] if self._selection is not None: self._cframe.canvas().delete(self._selectbox) self._selection = None self._y = 10 def set_chart(self, chart): self.clear() self._chart = chart self.update() def set_grammar(self, grammar): self.clear() self._grammar = grammar self.update() def destroy(self, *e): if self._root is None: return try: self._root.destroy() except: pass self._root = None def pack(self, *args, **kwargs): self._root.pack(*args, **kwargs) ####################################################################### # Chart Comparer ####################################################################### class ChartComparer(object): """ :ivar _root: The root window :ivar _charts: A dictionary mapping names to charts. When charts are loaded, they are added to this dictionary. :ivar _left_chart: The left ``Chart``. :ivar _left_name: The name ``_left_chart`` (derived from filename) :ivar _left_matrix: The ``ChartMatrixView`` for ``_left_chart`` :ivar _left_selector: The drop-down ``MutableOptionsMenu`` used to select ``_left_chart``. :ivar _right_chart: The right ``Chart``. :ivar _right_name: The name ``_right_chart`` (derived from filename) :ivar _right_matrix: The ``ChartMatrixView`` for ``_right_chart`` :ivar _right_selector: The drop-down ``MutableOptionsMenu`` used to select ``_right_chart``. :ivar _out_chart: The out ``Chart``. :ivar _out_name: The name ``_out_chart`` (derived from filename) :ivar _out_matrix: The ``ChartMatrixView`` for ``_out_chart`` :ivar _out_label: The label for ``_out_chart``. :ivar _op_label: A Label containing the most recent operation. """ _OPSYMBOL = {'-': '-', 'and': SymbolWidget.SYMBOLS['intersection'], 'or': SymbolWidget.SYMBOLS['union']} def __init__(self, *chart_filenames): # This chart is displayed when we don't have a value (eg # before any chart is loaded). faketok = [''] * 8 self._emptychart = Chart(faketok) # The left & right charts start out empty. 
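        # (Usage note, not from the original source: a rough sketch of how the
        # comparer is typically driven.  The chart pickles are assumed to have
        # been saved elsewhere, e.g. with the "Save Chart" command of
        # ChartParserApp; the filenames are placeholders.)
        #
        #     ChartComparer('left.pickle', 'right.pickle').mainloop()
        #
        # Filenames passed to the constructor are handed to load_chart(),
        # which unpickles each chart and registers it in the left/right
        # drop-down selectors.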
self._left_name = 'None' self._right_name = 'None' self._left_chart = self._emptychart self._right_chart = self._emptychart # The charts that have been loaded. self._charts = {'None': self._emptychart} # The output chart. self._out_chart = self._emptychart # The most recent operation self._operator = None # Set up the root window. self._root = tkinter.Tk() self._root.title('Chart Comparison') self._root.bind('', self.destroy) self._root.bind('', self.destroy) # Initialize all widgets, etc. self._init_menubar(self._root) self._init_chartviews(self._root) self._init_divider(self._root) self._init_buttons(self._root) self._init_bindings(self._root) # Load any specified charts. for filename in chart_filenames: self.load_chart(filename) def destroy(self, *e): if self._root is None: return try: self._root.destroy() except: pass self._root = None def mainloop(self, *args, **kwargs): return self._root.mainloop(*args, **kwargs) #//////////////////////////////////////////////////////////// # Initialization #//////////////////////////////////////////////////////////// def _init_menubar(self, root): menubar = tkinter.Menu(root) # File menu filemenu = tkinter.Menu(menubar, tearoff=0) filemenu.add_command(label='Load Chart', accelerator='Ctrl-o', underline=0, command=self.load_chart_dialog) filemenu.add_command(label='Save Output', accelerator='Ctrl-s', underline=0, command=self.save_chart_dialog) filemenu.add_separator() filemenu.add_command(label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x') menubar.add_cascade(label='File', underline=0, menu=filemenu) # Compare menu opmenu = tkinter.Menu(menubar, tearoff=0) opmenu.add_command(label='Intersection', command=self._intersection, accelerator='+') opmenu.add_command(label='Union', command=self._union, accelerator='*') opmenu.add_command(label='Difference', command=self._difference, accelerator='-') opmenu.add_separator() opmenu.add_command(label='Swap Charts', command=self._swapcharts) menubar.add_cascade(label='Compare', underline=0, menu=opmenu) # Add the menu self._root.config(menu=menubar) def _init_divider(self, root): divider = tkinter.Frame(root, border=2, relief='sunken') divider.pack(side='top', fill='x', ipady=2) def _init_chartviews(self, root): opfont=('symbol', -36) # Font for operator. eqfont=('helvetica', -36) # Font for equals sign. frame = tkinter.Frame(root, background='#c0c0c0') frame.pack(side='top', expand=1, fill='both') # The left matrix. cv1_frame = tkinter.Frame(frame, border=3, relief='groove') cv1_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both') self._left_selector = MutableOptionMenu( cv1_frame, list(self._charts.keys()), command=self._select_left) self._left_selector.pack(side='top', pady=5, fill='x') self._left_matrix = ChartMatrixView(cv1_frame, self._emptychart, toplevel=False, show_numedges=True) self._left_matrix.pack(side='bottom', padx=5, pady=5, expand=1, fill='both') self._left_matrix.add_callback('select', self.select_edge) self._left_matrix.add_callback('select_cell', self.select_cell) self._left_matrix.inactivate() # The operator. self._op_label = tkinter.Label(frame, text=' ', width=3, background='#c0c0c0', font=opfont) self._op_label.pack(side='left', padx=5, pady=5) # The right matrix. 
cv2_frame = tkinter.Frame(frame, border=3, relief='groove') cv2_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both') self._right_selector = MutableOptionMenu( cv2_frame, list(self._charts.keys()), command=self._select_right) self._right_selector.pack(side='top', pady=5, fill='x') self._right_matrix = ChartMatrixView(cv2_frame, self._emptychart, toplevel=False, show_numedges=True) self._right_matrix.pack(side='bottom', padx=5, pady=5, expand=1, fill='both') self._right_matrix.add_callback('select', self.select_edge) self._right_matrix.add_callback('select_cell', self.select_cell) self._right_matrix.inactivate() # The equals sign tkinter.Label(frame, text='=', width=3, background='#c0c0c0', font=eqfont).pack(side='left', padx=5, pady=5) # The output matrix. out_frame = tkinter.Frame(frame, border=3, relief='groove') out_frame.pack(side='left', padx=8, pady=7, expand=1, fill='both') self._out_label = tkinter.Label(out_frame, text='Output') self._out_label.pack(side='top', pady=9) self._out_matrix = ChartMatrixView(out_frame, self._emptychart, toplevel=False, show_numedges=True) self._out_matrix.pack(side='bottom', padx=5, pady=5, expand=1, fill='both') self._out_matrix.add_callback('select', self.select_edge) self._out_matrix.add_callback('select_cell', self.select_cell) self._out_matrix.inactivate() def _init_buttons(self, root): buttons = tkinter.Frame(root) buttons.pack(side='bottom', pady=5, fill='x', expand=0) tkinter.Button(buttons, text='Intersection', command=self._intersection).pack(side='left') tkinter.Button(buttons, text='Union', command=self._union).pack(side='left') tkinter.Button(buttons, text='Difference', command=self._difference).pack(side='left') tkinter.Frame(buttons, width=20).pack(side='left') tkinter.Button(buttons, text='Swap Charts', command=self._swapcharts).pack(side='left') tkinter.Button(buttons, text='Detatch Output', command=self._detatch_out).pack(side='right') def _init_bindings(self, root): #root.bind('', self.save_chart) root.bind('', self.load_chart_dialog) #root.bind('', self.reset) #//////////////////////////////////////////////////////////// # Input Handling #//////////////////////////////////////////////////////////// def _select_left(self, name): self._left_name = name self._left_chart = self._charts[name] self._left_matrix.set_chart(self._left_chart) if name == 'None': self._left_matrix.inactivate() self._apply_op() def _select_right(self, name): self._right_name = name self._right_chart = self._charts[name] self._right_matrix.set_chart(self._right_chart) if name == 'None': self._right_matrix.inactivate() self._apply_op() def _apply_op(self): if self._operator == '-': self._difference() elif self._operator == 'or': self._union() elif self._operator == 'and': self._intersection() #//////////////////////////////////////////////////////////// # File #//////////////////////////////////////////////////////////// CHART_FILE_TYPES = [('Pickle file', '.pickle'), ('All files', '*')] def save_chart_dialog(self, *args): filename = asksaveasfilename(filetypes=self.CHART_FILE_TYPES, defaultextension='.pickle') if not filename: return try: with open(filename, 'wb') as outfile: pickle.dump(self._out_chart, outfile) except Exception as e: tkinter.messagebox.showerror('Error Saving Chart', 'Unable to open file: %r\n%s' % (filename, e)) def load_chart_dialog(self, *args): filename = askopenfilename(filetypes=self.CHART_FILE_TYPES, defaultextension='.pickle') if not filename: return try: self.load_chart(filename) except Exception as e: 
tkinter.messagebox.showerror('Error Loading Chart', 'Unable to open file: %r\n%s' % (filename, e)) def load_chart(self, filename): with open(filename, 'rb') as infile: chart = pickle.load(infile) name = os.path.basename(filename) if name.endswith('.pickle'): name = name[:-7] if name.endswith('.chart'): name = name[:-6] self._charts[name] = chart self._left_selector.add(name) self._right_selector.add(name) # If either left_matrix or right_matrix is empty, then # display the new chart. if self._left_chart is self._emptychart: self._left_selector.set(name) elif self._right_chart is self._emptychart: self._right_selector.set(name) def _update_chartviews(self): self._left_matrix.update() self._right_matrix.update() self._out_matrix.update() #//////////////////////////////////////////////////////////// # Selection #//////////////////////////////////////////////////////////// def select_edge(self, edge): if edge in self._left_chart: self._left_matrix.markonly_edge(edge) else: self._left_matrix.unmark_edge() if edge in self._right_chart: self._right_matrix.markonly_edge(edge) else: self._right_matrix.unmark_edge() if edge in self._out_chart: self._out_matrix.markonly_edge(edge) else: self._out_matrix.unmark_edge() def select_cell(self, i, j): self._left_matrix.select_cell(i, j) self._right_matrix.select_cell(i, j) self._out_matrix.select_cell(i, j) #//////////////////////////////////////////////////////////// # Operations #//////////////////////////////////////////////////////////// def _difference(self): if not self._checkcompat(): return out_chart = Chart(self._left_chart.tokens()) for edge in self._left_chart: if edge not in self._right_chart: out_chart.insert(edge, []) self._update('-', out_chart) def _intersection(self): if not self._checkcompat(): return out_chart = Chart(self._left_chart.tokens()) for edge in self._left_chart: if edge in self._right_chart: out_chart.insert(edge, []) self._update('and', out_chart) def _union(self): if not self._checkcompat(): return out_chart = Chart(self._left_chart.tokens()) for edge in self._left_chart: out_chart.insert(edge, []) for edge in self._right_chart: out_chart.insert(edge, []) self._update('or', out_chart) def _swapcharts(self): left, right = self._left_name, self._right_name self._left_selector.set(right) self._right_selector.set(left) def _checkcompat(self): if (self._left_chart.tokens() != self._right_chart.tokens() or self._left_chart.property_names() != self._right_chart.property_names() or self._left_chart == self._emptychart or self._right_chart == self._emptychart): # Clear & inactivate the output chart. self._out_chart = self._emptychart self._out_matrix.set_chart(self._out_chart) self._out_matrix.inactivate() self._out_label['text'] = 'Output' # Issue some other warning? 
return False else: return True def _update(self, operator, out_chart): self._operator = operator self._op_label['text'] = self._OPSYMBOL[operator] self._out_chart = out_chart self._out_matrix.set_chart(out_chart) self._out_label['text'] = '%s %s %s' % (self._left_name, self._operator, self._right_name) def _clear_out_chart(self): self._out_chart = self._emptychart self._out_matrix.set_chart(self._out_chart) self._op_label['text'] = ' ' self._out_matrix.inactivate() def _detatch_out(self): ChartMatrixView(self._root, self._out_chart, title=self._out_label['text']) ####################################################################### # Chart View ####################################################################### class ChartView(object): """ A component for viewing charts. This is used by ``ChartParserApp`` to allow students to interactively experiment with various chart parsing techniques. It is also used by ``Chart.draw()``. :ivar _chart: The chart that we are giving a view of. This chart may be modified; after it is modified, you should call ``update``. :ivar _sentence: The list of tokens that the chart spans. :ivar _root: The root window. :ivar _chart_canvas: The canvas we're using to display the chart itself. :ivar _tree_canvas: The canvas we're using to display the tree that each edge spans. May be None, if we're not displaying trees. :ivar _sentence_canvas: The canvas we're using to display the sentence text. May be None, if we're not displaying the sentence text. :ivar _edgetags: A dictionary mapping from edges to the tags of the canvas elements (lines, etc) used to display that edge. The values of this dictionary have the form ``(linetag, rhstag1, dottag, rhstag2, lhstag)``. :ivar _treetags: A list of all the tags that make up the tree; used to erase the tree (without erasing the loclines). :ivar _chart_height: The height of the chart canvas. :ivar _sentence_height: The height of the sentence canvas. :ivar _tree_height: The height of the tree :ivar _text_height: The height of a text string (in the normal font). :ivar _edgelevels: A list of edges at each level of the chart (the top level is the 0th element). This list is used to remember where edges should be drawn; and to make sure that no edges are overlapping on the chart view. :ivar _unitsize: Pixel size of one unit (from the location). This is determined by the span of the chart's location, and the width of the chart display canvas. :ivar _fontsize: The current font size :ivar _marks: A dictionary from edges to marks. Marks are strings, specifying colors (e.g. 'green'). """ _LEAF_SPACING = 10 _MARGIN = 10 _TREE_LEVEL_SIZE = 12 _CHART_LEVEL_SIZE = 40 def __init__(self, chart, root=None, **kw): """ Construct a new ``Chart`` display. """ # Process keyword args. draw_tree = kw.get('draw_tree', 0) draw_sentence = kw.get('draw_sentence', 1) self._fontsize = kw.get('fontsize', -12) # The chart! self._chart = chart # Callback functions self._callbacks = {} # Keep track of drawn edges self._edgelevels = [] self._edgetags = {} # Keep track of which edges are marked. self._marks = {} # These are used to keep track of the set of tree tokens # currently displayed in the tree canvas. self._treetoks = [] self._treetoks_edge = None self._treetoks_index = 0 # Keep track of the tags used to draw the tree self._tree_tags = [] # Put multiple edges on each level? self._compact = 0 # If they didn't provide a main window, then set one up. 
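        # (Usage note, not in the original code: ChartView is normally created
        # by ChartParserApp, roughly as
        #     ChartView(chart, parent, draw_tree=1, draw_sentence=1)
        # and refreshed after the chart changes by calling update(); see
        # ChartParserApp._init_chartview() below.)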
if root is None: top = tkinter.Tk() top.title('Chart View') def destroy1(e, top=top): top.destroy() def destroy2(top=top): top.destroy() top.bind('q', destroy1) b = tkinter.Button(top, text='Done', command=destroy2) b.pack(side='bottom') self._root = top else: self._root = root # Create some fonts. self._init_fonts(root) # Create the chart canvas. (self._chart_sb, self._chart_canvas) = self._sb_canvas(self._root) self._chart_canvas['height'] = 300 self._chart_canvas['closeenough'] = 15 # Create the sentence canvas. if draw_sentence: cframe = tkinter.Frame(self._root, relief='sunk', border=2) cframe.pack(fill='both', side='bottom') self._sentence_canvas = tkinter.Canvas(cframe, height=50) self._sentence_canvas['background'] = '#e0e0e0' self._sentence_canvas.pack(fill='both') #self._sentence_canvas['height'] = self._sentence_height else: self._sentence_canvas = None # Create the tree canvas. if draw_tree: (sb, canvas) = self._sb_canvas(self._root, 'n', 'x') (self._tree_sb, self._tree_canvas) = (sb, canvas) self._tree_canvas['height'] = 200 else: self._tree_canvas = None # Do some analysis to figure out how big the window should be self._analyze() self.draw() self._resize() self._grow() # Set up the configure callback, which will be called whenever # the window is resized. self._chart_canvas.bind('', self._configure) def _init_fonts(self, root): self._boldfont = tkinter.font.Font(family='helvetica', weight='bold', size=self._fontsize) self._font = tkinter.font.Font(family='helvetica', size=self._fontsize) # See: self._sysfont = tkinter.font.Font(font=tkinter.Button()["font"]) root.option_add("*Font", self._sysfont) def _sb_canvas(self, root, expand='y', fill='both', side='bottom'): """ Helper for __init__: construct a canvas with a scrollbar. """ cframe =tkinter.Frame(root, relief='sunk', border=2) cframe.pack(fill=fill, expand=expand, side=side) canvas = tkinter.Canvas(cframe, background='#e0e0e0') # Give the canvas a scrollbar. sb = tkinter.Scrollbar(cframe, orient='vertical') sb.pack(side='right', fill='y') canvas.pack(side='left', fill=fill, expand='yes') # Connect the scrollbars to the canvas. sb['command']= canvas.yview canvas['yscrollcommand'] = sb.set return (sb, canvas) def scroll_up(self, *e): self._chart_canvas.yview('scroll', -1, 'units') def scroll_down(self, *e): self._chart_canvas.yview('scroll', 1, 'units') def page_up(self, *e): self._chart_canvas.yview('scroll', -1, 'pages') def page_down(self, *e): self._chart_canvas.yview('scroll', 1, 'pages') def _grow(self): """ Grow the window, if necessary """ # Grow, if need-be N = self._chart.num_leaves() width = max(int(self._chart_canvas['width']), N * self._unitsize + ChartView._MARGIN * 2 ) # It won't resize without the second (height) line, but I # don't understand why not. self._chart_canvas.configure(width=width) self._chart_canvas.configure(height=self._chart_canvas['height']) self._unitsize = (width - 2*ChartView._MARGIN) / N # Reset the height for the sentence window. if self._sentence_canvas is not None: self._sentence_canvas['height'] = self._sentence_height def set_font_size(self, size): self._font.configure(size=-abs(size)) self._boldfont.configure(size=-abs(size)) self._sysfont.configure(size=-abs(size)) self._analyze() self._grow() self.draw() def get_font_size(self): return abs(self._fontsize) def _configure(self, e): """ The configure callback. This is called whenever the window is resized. It is also called when the window is first mapped. It figures out the unit size, and redraws the contents of each canvas. 
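        For instance (numbers are illustrative only), with the default
        10-pixel margin a 400-pixel-wide canvas over a 4-leaf chart gives a
        unit size of (400 - 2*10) / 4 = 95 pixels.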
""" N = self._chart.num_leaves() self._unitsize = (e.width - 2*ChartView._MARGIN) / N self.draw() def update(self, chart=None): """ Draw any edges that have not been drawn. This is typically called when a after modifies the canvas that a CanvasView is displaying. ``update`` will cause any edges that have been added to the chart to be drawn. If update is given a ``chart`` argument, then it will replace the current chart with the given chart. """ if chart is not None: self._chart = chart self._edgelevels = [] self._marks = {} self._analyze() self._grow() self.draw() self.erase_tree() self._resize() else: for edge in self._chart: if edge not in self._edgetags: self._add_edge(edge) self._resize() def _edge_conflict(self, edge, lvl): """ Return True if the given edge overlaps with any edge on the given level. This is used by _add_edge to figure out what level a new edge should be added to. """ (s1, e1) = edge.span() for otheredge in self._edgelevels[lvl]: (s2, e2) = otheredge.span() if (s1 <= s2 < e1) or (s2 <= s1 < e2) or (s1==s2==e1==e2): return True return False def _analyze_edge(self, edge): """ Given a new edge, recalculate: - _text_height - _unitsize (if the edge text is too big for the current _unitsize, then increase _unitsize) """ c = self._chart_canvas if isinstance(edge, TreeEdge): lhs = edge.lhs() rhselts = [] for elt in edge.rhs(): if isinstance(elt, Nonterminal): rhselts.append(str(elt.symbol())) else: rhselts.append(repr(elt)) rhs = " ".join(rhselts) else: lhs = edge.lhs() rhs = '' for s in (lhs, rhs): tag = c.create_text(0,0, text=s, font=self._boldfont, anchor='nw', justify='left') bbox = c.bbox(tag) c.delete(tag) width = bbox[2] #+ ChartView._LEAF_SPACING edgelen = max(edge.length(), 1) self._unitsize = max(self._unitsize, width/edgelen) self._text_height = max(self._text_height, bbox[3] - bbox[1]) def _add_edge(self, edge, minlvl=0): """ Add a single edge to the ChartView: - Call analyze_edge to recalculate display parameters - Find an available level - Call _draw_edge """ # Do NOT show leaf edges in the chart. if isinstance(edge, LeafEdge): return if edge in self._edgetags: return self._analyze_edge(edge) self._grow() if not self._compact: self._edgelevels.append([edge]) lvl = len(self._edgelevels)-1 self._draw_edge(edge, lvl) self._resize() return # Figure out what level to draw the edge on. lvl = 0 while True: # If this level doesn't exist yet, create it. while lvl >= len(self._edgelevels): self._edgelevels.append([]) self._resize() # Check if we can fit the edge in this level. if lvl>=minlvl and not self._edge_conflict(edge, lvl): # Go ahead and draw it. self._edgelevels[lvl].append(edge) break # Try the next level. lvl += 1 self._draw_edge(edge, lvl) def view_edge(self, edge): level = None for i in range(len(self._edgelevels)): if edge in self._edgelevels[i]: level = i break if level is None: return # Try to view the new edge.. y = (level+1) * self._chart_level_size dy = self._text_height + 10 self._chart_canvas.yview('moveto', 1.0) if self._chart_height != 0: self._chart_canvas.yview('moveto', float(y-dy)/self._chart_height) def _draw_edge(self, edge, lvl): """ Draw a single edge on the ChartView. """ c = self._chart_canvas # Draw the arrow. x1 = (edge.start() * self._unitsize + ChartView._MARGIN) x2 = (edge.end() * self._unitsize + ChartView._MARGIN) if x2 == x1: x2 += max(4, self._unitsize/5) y = (lvl+1) * self._chart_level_size linetag = c.create_line(x1, y, x2, y, arrow='last', width=3) # Draw a label for the edge. 
if isinstance(edge, TreeEdge): rhs = [] for elt in edge.rhs(): if isinstance(elt, Nonterminal): rhs.append(str(elt.symbol())) else: rhs.append(repr(elt)) pos = edge.dot() else: rhs = [] pos = 0 rhs1 = " ".join(rhs[:pos]) rhs2 = " ".join(rhs[pos:]) rhstag1 = c.create_text(x1+3, y, text=rhs1, font=self._font, anchor='nw') dotx = c.bbox(rhstag1)[2] + 6 doty = (c.bbox(rhstag1)[1]+c.bbox(rhstag1)[3])/2 dottag = c.create_oval(dotx-2, doty-2, dotx+2, doty+2) rhstag2 = c.create_text(dotx+6, y, text=rhs2, font=self._font, anchor='nw') lhstag = c.create_text((x1+x2)/2, y, text=str(edge.lhs()), anchor='s', font=self._boldfont) # Keep track of the edge's tags. self._edgetags[edge] = (linetag, rhstag1, dottag, rhstag2, lhstag) # Register a callback for clicking on the edge. def cb(event, self=self, edge=edge): self._fire_callbacks('select', edge) c.tag_bind(rhstag1, '', cb) c.tag_bind(rhstag2, '', cb) c.tag_bind(linetag, '', cb) c.tag_bind(dottag, '', cb) c.tag_bind(lhstag, '', cb) self._color_edge(edge) def _color_edge(self, edge, linecolor=None, textcolor=None): """ Color in an edge with the given colors. If no colors are specified, use intelligent defaults (dependent on selection, etc.) """ if edge not in self._edgetags: return c = self._chart_canvas if linecolor is not None and textcolor is not None: if edge in self._marks: linecolor = self._marks[edge] tags = self._edgetags[edge] c.itemconfig(tags[0], fill=linecolor) c.itemconfig(tags[1], fill=textcolor) c.itemconfig(tags[2], fill=textcolor, outline=textcolor) c.itemconfig(tags[3], fill=textcolor) c.itemconfig(tags[4], fill=textcolor) return else: N = self._chart.num_leaves() if edge in self._marks: self._color_edge(self._marks[edge]) if (edge.is_complete() and edge.span() == (0, N)): self._color_edge(edge, '#084', '#042') elif isinstance(edge, LeafEdge): self._color_edge(edge, '#48c', '#246') else: self._color_edge(edge, '#00f', '#008') def mark_edge(self, edge, mark='#0df'): """ Mark an edge """ self._marks[edge] = mark self._color_edge(edge) def unmark_edge(self, edge=None): """ Unmark an edge (or all edges) """ if edge is None: old_marked_edges = list(self._marks.keys()) self._marks = {} for edge in old_marked_edges: self._color_edge(edge) else: del self._marks[edge] self._color_edge(edge) def markonly_edge(self, edge, mark='#0df'): self.unmark_edge() self.mark_edge(edge, mark) def _analyze(self): """ Analyze the sentence string, to figure out how big a unit needs to be, How big the tree should be, etc. """ # Figure out the text height and the unit size. unitsize = 70 # min unitsize text_height = 0 c = self._chart_canvas # Check against all tokens for leaf in self._chart.leaves(): tag = c.create_text(0,0, text=repr(leaf), font=self._font, anchor='nw', justify='left') bbox = c.bbox(tag) c.delete(tag) width = bbox[2] + ChartView._LEAF_SPACING unitsize = max(width, unitsize) text_height = max(text_height, bbox[3] - bbox[1]) self._unitsize = unitsize self._text_height = text_height self._sentence_height = (self._text_height + 2*ChartView._MARGIN) # Check against edges. for edge in self._chart.edges(): self._analyze_edge(edge) # Size of chart levels self._chart_level_size = self._text_height * 2 # Default tree size.. self._tree_height = (3 * (ChartView._TREE_LEVEL_SIZE + self._text_height)) # Resize the scrollregions. self._resize() def _resize(self): """ Update the scroll-regions for each canvas. This ensures that everything is within a scroll-region, so the user can use the scrollbars to view the entire display. This does *not* resize the window. 
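        As a worked example (numbers are illustrative only), a 4-leaf chart
        with a 95-pixel unit size gets a chart scroll region
        4*95 + 2*10 = 400 pixels wide, and with three edge levels a height of
        (3+2) chart-level sizes.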
""" c = self._chart_canvas # Reset the chart scroll region width = ( self._chart.num_leaves() * self._unitsize + ChartView._MARGIN * 2 ) levels = len(self._edgelevels) self._chart_height = (levels+2)*self._chart_level_size c['scrollregion']=(0,0,width,self._chart_height) # Reset the tree scroll region if self._tree_canvas: self._tree_canvas['scrollregion'] = (0, 0, width, self._tree_height) def _draw_loclines(self): """ Draw location lines. These are vertical gridlines used to show where each location unit is. """ BOTTOM = 50000 c1 = self._tree_canvas c2 = self._sentence_canvas c3 = self._chart_canvas margin = ChartView._MARGIN self._loclines = [] for i in range(0, self._chart.num_leaves()+1): x = i*self._unitsize + margin if c1: t1=c1.create_line(x, 0, x, BOTTOM) c1.tag_lower(t1) if c2: t2=c2.create_line(x, 0, x, self._sentence_height) c2.tag_lower(t2) t3=c3.create_line(x, 0, x, BOTTOM) c3.tag_lower(t3) t4=c3.create_text(x+2, 0, text=repr(i), anchor='nw', font=self._font) c3.tag_lower(t4) #if i % 4 == 0: # if c1: c1.itemconfig(t1, width=2, fill='gray60') # if c2: c2.itemconfig(t2, width=2, fill='gray60') # c3.itemconfig(t3, width=2, fill='gray60') if i % 2 == 0: if c1: c1.itemconfig(t1, fill='gray60') if c2: c2.itemconfig(t2, fill='gray60') c3.itemconfig(t3, fill='gray60') else: if c1: c1.itemconfig(t1, fill='gray80') if c2: c2.itemconfig(t2, fill='gray80') c3.itemconfig(t3, fill='gray80') def _draw_sentence(self): """Draw the sentence string.""" if self._chart.num_leaves() == 0: return c = self._sentence_canvas margin = ChartView._MARGIN y = ChartView._MARGIN for i, leaf in enumerate(self._chart.leaves()): x1 = i * self._unitsize + margin x2 = x1 + self._unitsize x = (x1+x2)/2 tag = c.create_text(x, y, text=repr(leaf), font=self._font, anchor='n', justify='left') bbox = c.bbox(tag) rt=c.create_rectangle(x1+2, bbox[1]-(ChartView._LEAF_SPACING/2), x2-2, bbox[3]+(ChartView._LEAF_SPACING/2), fill='#f0f0f0', outline='#f0f0f0') c.tag_lower(rt) def erase_tree(self): for tag in self._tree_tags: self._tree_canvas.delete(tag) self._treetoks = [] self._treetoks_edge = None self._treetoks_index = 0 def draw_tree(self, edge=None): if edge is None and self._treetoks_edge is None: return if edge is None: edge = self._treetoks_edge # If it's a new edge, then get a new list of treetoks. if self._treetoks_edge != edge: self._treetoks = [t for t in self._chart.trees(edge) if isinstance(t, Tree)] self._treetoks_edge = edge self._treetoks_index = 0 # Make sure there's something to draw. if len(self._treetoks) == 0: return # Erase the old tree. for tag in self._tree_tags: self._tree_canvas.delete(tag) # Draw the new tree. tree = self._treetoks[self._treetoks_index] self._draw_treetok(tree, edge.start()) # Show how many trees are available for the edge. self._draw_treecycle() # Update the scroll region. w = self._chart.num_leaves()*self._unitsize+2*ChartView._MARGIN h = tree.height() * (ChartView._TREE_LEVEL_SIZE+self._text_height) self._tree_canvas['scrollregion'] = (0, 0, w, h) def cycle_tree(self): self._treetoks_index = (self._treetoks_index+1)%len(self._treetoks) self.draw_tree(self._treetoks_edge) def _draw_treecycle(self): if len(self._treetoks) <= 1: return # Draw the label. label = '%d Trees' % len(self._treetoks) c = self._tree_canvas margin = ChartView._MARGIN right = self._chart.num_leaves()*self._unitsize+margin-2 tag = c.create_text(right, 2, anchor='ne', text=label, font=self._boldfont) self._tree_tags.append(tag) _, _, _, y = c.bbox(tag) # Draw the triangles. 
for i in range(len(self._treetoks)): x = right - 20*(len(self._treetoks)-i-1) if i == self._treetoks_index: fill = '#084' else: fill = '#fff' tag = c.create_polygon(x, y+10, x-5, y, x-10, y+10, fill=fill, outline='black') self._tree_tags.append(tag) # Set up a callback: show the tree if they click on its # triangle. def cb(event, self=self, i=i): self._treetoks_index = i self.draw_tree() c.tag_bind(tag, '', cb) def _draw_treetok(self, treetok, index, depth=0): """ :param index: The index of the first leaf in the tree. :return: The index of the first leaf after the tree. """ c = self._tree_canvas margin = ChartView._MARGIN # Draw the children child_xs = [] for child in treetok: if isinstance(child, Tree): child_x, index = self._draw_treetok(child, index, depth+1) child_xs.append(child_x) else: child_xs.append((2*index+1)*self._unitsize/2 + margin) index += 1 # If we have children, then get the node's x by averaging their # node x's. Otherwise, make room for ourselves. if child_xs: nodex = sum(child_xs)/len(child_xs) else: # [XX] breaks for null productions. nodex = (2*index+1)*self._unitsize/2 + margin index += 1 # Draw the node nodey = depth * (ChartView._TREE_LEVEL_SIZE + self._text_height) tag = c.create_text(nodex, nodey, anchor='n', justify='center', text=str(treetok.label()), fill='#042', font=self._boldfont) self._tree_tags.append(tag) # Draw lines to the children. childy = nodey + ChartView._TREE_LEVEL_SIZE + self._text_height for childx, child in zip(child_xs, treetok): if isinstance(child, Tree) and child: # A "real" tree token: tag = c.create_line(nodex, nodey + self._text_height, childx, childy, width=2, fill='#084') self._tree_tags.append(tag) if isinstance(child, Tree) and not child: # An unexpanded tree token: tag = c.create_line(nodex, nodey + self._text_height, childx, childy, width=2, fill='#048', dash='2 3') self._tree_tags.append(tag) if not isinstance(child, Tree): # A leaf: tag = c.create_line(nodex, nodey + self._text_height, childx, 10000, width=2, fill='#084') self._tree_tags.append(tag) return nodex, index def draw(self): """ Draw everything (from scratch). """ if self._tree_canvas: self._tree_canvas.delete('all') self.draw_tree() if self._sentence_canvas: self._sentence_canvas.delete('all') self._draw_sentence() self._chart_canvas.delete('all') self._edgetags = {} # Redraw any edges we erased. for lvl in range(len(self._edgelevels)): for edge in self._edgelevels[lvl]: self._draw_edge(edge, lvl) for edge in self._chart: self._add_edge(edge) self._draw_loclines() def add_callback(self, event, func): self._callbacks.setdefault(event,{})[func] = 1 def remove_callback(self, event, func=None): if func is None: del self._callbacks[event] else: try: del self._callbacks[event][func] except: pass def _fire_callbacks(self, event, *args): if event not in self._callbacks: return for cb_func in list(self._callbacks[event].keys()): cb_func(*args) ####################################################################### # Edge Rules ####################################################################### # These version of the chart rules only apply to a specific edge. # This lets the user select an edge, and then apply a rule. class EdgeRule(object): """ To create an edge rule, make an empty base class that uses EdgeRule as the first base class, and the basic rule as the second base class. (Order matters!) 
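    For example, the edge-restricted version of the fundamental rule defined
    just below is simply::

        class FundamentalEdgeRule(EdgeRule, SingleEdgeFundamentalRule): pass

    Listing ``EdgeRule`` first puts its ``apply`` and ``__str__`` wrappers
    ahead of the plain rule's versions in the method resolution order, which
    is why the order of the base classes matters.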
""" def __init__(self, edge): super = self.__class__.__bases__[1] self._edge = edge self.NUM_EDGES = super.NUM_EDGES-1 def apply(self, chart, grammar, *edges): super = self.__class__.__bases__[1] edges += (self._edge,) for e in super.apply(self, chart, grammar, *edges): yield e def __str__(self): super = self.__class__.__bases__[1] return super.__str__(self) class TopDownPredictEdgeRule(EdgeRule, TopDownPredictRule): pass class BottomUpEdgeRule(EdgeRule, BottomUpPredictRule): pass class BottomUpLeftCornerEdgeRule(EdgeRule, BottomUpPredictCombineRule): pass class FundamentalEdgeRule(EdgeRule, SingleEdgeFundamentalRule): pass ####################################################################### # Chart Parser Application ####################################################################### class ChartParserApp(object): def __init__(self, grammar, tokens, title='Chart Parser Application'): # Initialize the parser self._init_parser(grammar, tokens) self._root = None try: # Create the root window. self._root = tkinter.Tk() self._root.title(title) self._root.bind('', self.destroy) # Set up some frames. frame3 = tkinter.Frame(self._root) frame2 = tkinter.Frame(self._root) frame1 = tkinter.Frame(self._root) frame3.pack(side='bottom', fill='none') frame2.pack(side='bottom', fill='x') frame1.pack(side='bottom', fill='both', expand=1) self._init_fonts(self._root) self._init_animation() self._init_chartview(frame1) self._init_rulelabel(frame2) self._init_buttons(frame3) self._init_menubar() self._matrix = None self._results = None # Set up keyboard bindings. self._init_bindings() except: print('Error creating Tree View') self.destroy() raise def destroy(self, *args): if self._root is None: return self._root.destroy() self._root = None def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this demo is created from a non-interactive program (e.g. from a secript); otherwise, the demo will close as soon as the script completes. """ if in_idle(): return self._root.mainloop(*args, **kwargs) #//////////////////////////////////////////////////////////// # Initialization Helpers #//////////////////////////////////////////////////////////// def _init_parser(self, grammar, tokens): self._grammar = grammar self._tokens = tokens self._reset_parser() def _reset_parser(self): self._cp = SteppingChartParser(self._grammar) self._cp.initialize(self._tokens) self._chart = self._cp.chart() # Insert LeafEdges before the parsing starts. for _new_edge in LeafInitRule().apply(self._chart, self._grammar): pass # The step iterator -- use this to generate new edges self._cpstep = self._cp.step() # The currently selected edge self._selection = None def _init_fonts(self, root): # See: self._sysfont = tkinter.font.Font(font=tkinter.Button()["font"]) root.option_add("*Font", self._sysfont) # TWhat's our font size (default=same as sysfont) self._size = tkinter.IntVar(root) self._size.set(self._sysfont.cget('size')) self._boldfont = tkinter.font.Font(family='helvetica', weight='bold', size=self._size.get()) self._font = tkinter.font.Font(family='helvetica', size=self._size.get()) def _init_animation(self): # Are we stepping? (default=yes) self._step = tkinter.IntVar(self._root) self._step.set(1) # What's our animation speed (default=fast) self._animate = tkinter.IntVar(self._root) self._animate.set(3) # Default speed = fast # Are we currently animating? 
self._animating = 0 def _init_chartview(self, parent): self._cv = ChartView(self._chart, parent, draw_tree=1, draw_sentence=1) self._cv.add_callback('select', self._click_cv_edge) def _init_rulelabel(self, parent): ruletxt = 'Last edge generated by:' self._rulelabel1 = tkinter.Label(parent,text=ruletxt, font=self._boldfont) self._rulelabel2 = tkinter.Label(parent, width=40, relief='groove', anchor='w', font=self._boldfont) self._rulelabel1.pack(side='left') self._rulelabel2.pack(side='left') step = tkinter.Checkbutton(parent, variable=self._step, text='Step') step.pack(side='right') def _init_buttons(self, parent): frame1 = tkinter.Frame(parent) frame2 = tkinter.Frame(parent) frame1.pack(side='bottom', fill='x') frame2.pack(side='top', fill='none') tkinter.Button(frame1, text='Reset\nParser', background='#90c0d0', foreground='black', command=self.reset).pack(side='right') #Tkinter.Button(frame1, text='Pause', # background='#90c0d0', foreground='black', # command=self.pause).pack(side='left') tkinter.Button(frame1, text='Top Down\nStrategy', background='#90c0d0', foreground='black', command=self.top_down_strategy).pack(side='left') tkinter.Button(frame1, text='Bottom Up\nStrategy', background='#90c0d0', foreground='black', command=self.bottom_up_strategy).pack(side='left') tkinter.Button(frame1, text='Bottom Up\nLeft-Corner Strategy', background='#90c0d0', foreground='black', command=self.bottom_up_leftcorner_strategy).pack(side='left') tkinter.Button(frame2, text='Top Down Init\nRule', background='#90f090', foreground='black', command=self.top_down_init).pack(side='left') tkinter.Button(frame2, text='Top Down Predict\nRule', background='#90f090', foreground='black', command=self.top_down_predict).pack(side='left') tkinter.Frame(frame2, width=20).pack(side='left') tkinter.Button(frame2, text='Bottom Up Predict\nRule', background='#90f090', foreground='black', command=self.bottom_up).pack(side='left') tkinter.Frame(frame2, width=20).pack(side='left') tkinter.Button(frame2, text='Bottom Up Left-Corner\nPredict Rule', background='#90f090', foreground='black', command=self.bottom_up_leftcorner).pack(side='left') tkinter.Frame(frame2, width=20).pack(side='left') tkinter.Button(frame2, text='Fundamental\nRule', background='#90f090', foreground='black', command=self.fundamental).pack(side='left') def _init_bindings(self): self._root.bind('', self._cv.scroll_up) self._root.bind('', self._cv.scroll_down) self._root.bind('', self._cv.page_up) self._root.bind('', self._cv.page_down) self._root.bind('', self.destroy) self._root.bind('', self.destroy) self._root.bind('', self.help) self._root.bind('', self.save_chart) self._root.bind('', self.load_chart) self._root.bind('', self.reset) self._root.bind('t', self.top_down_strategy) self._root.bind('b', self.bottom_up_strategy) self._root.bind('c', self.bottom_up_leftcorner_strategy) self._root.bind('', self._stop_animation) self._root.bind('', self.edit_grammar) self._root.bind('', self.edit_sentence) # Animation speed control self._root.bind('-', lambda e,a=self._animate:a.set(1)) self._root.bind('=', lambda e,a=self._animate:a.set(2)) self._root.bind('+', lambda e,a=self._animate:a.set(3)) # Step control self._root.bind('s', lambda e,s=self._step:s.set(not s.get())) def _init_menubar(self): menubar = tkinter.Menu(self._root) filemenu = tkinter.Menu(menubar, tearoff=0) filemenu.add_command(label='Save Chart', underline=0, command=self.save_chart, accelerator='Ctrl-s') filemenu.add_command(label='Load Chart', underline=0, command=self.load_chart, 
accelerator='Ctrl-o') filemenu.add_command(label='Reset Chart', underline=0, command=self.reset, accelerator='Ctrl-r') filemenu.add_separator() filemenu.add_command(label='Save Grammar', command=self.save_grammar) filemenu.add_command(label='Load Grammar', command=self.load_grammar) filemenu.add_separator() filemenu.add_command(label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x') menubar.add_cascade(label='File', underline=0, menu=filemenu) editmenu = tkinter.Menu(menubar, tearoff=0) editmenu.add_command(label='Edit Grammar', underline=5, command=self.edit_grammar, accelerator='Ctrl-g') editmenu.add_command(label='Edit Text', underline=5, command=self.edit_sentence, accelerator='Ctrl-t') menubar.add_cascade(label='Edit', underline=0, menu=editmenu) viewmenu = tkinter.Menu(menubar, tearoff=0) viewmenu.add_command(label='Chart Matrix', underline=6, command=self.view_matrix) viewmenu.add_command(label='Results', underline=0, command=self.view_results) menubar.add_cascade(label='View', underline=0, menu=viewmenu) rulemenu = tkinter.Menu(menubar, tearoff=0) rulemenu.add_command(label='Top Down Strategy', underline=0, command=self.top_down_strategy, accelerator='t') rulemenu.add_command(label='Bottom Up Strategy', underline=0, command=self.bottom_up_strategy, accelerator='b') rulemenu.add_command(label='Bottom Up Left-Corner Strategy', underline=0, command=self.bottom_up_leftcorner_strategy, accelerator='c') rulemenu.add_separator() rulemenu.add_command(label='Bottom Up Rule', command=self.bottom_up) rulemenu.add_command(label='Bottom Up Left-Corner Rule', command=self.bottom_up_leftcorner) rulemenu.add_command(label='Top Down Init Rule', command=self.top_down_init) rulemenu.add_command(label='Top Down Predict Rule', command=self.top_down_predict) rulemenu.add_command(label='Fundamental Rule', command=self.fundamental) menubar.add_cascade(label='Apply', underline=0, menu=rulemenu) animatemenu = tkinter.Menu(menubar, tearoff=0) animatemenu.add_checkbutton(label="Step", underline=0, variable=self._step, accelerator='s') animatemenu.add_separator() animatemenu.add_radiobutton(label="No Animation", underline=0, variable=self._animate, value=0) animatemenu.add_radiobutton(label="Slow Animation", underline=0, variable=self._animate, value=1, accelerator='-') animatemenu.add_radiobutton(label="Normal Animation", underline=0, variable=self._animate, value=2, accelerator='=') animatemenu.add_radiobutton(label="Fast Animation", underline=0, variable=self._animate, value=3, accelerator='+') menubar.add_cascade(label="Animate", underline=1, menu=animatemenu) zoommenu = tkinter.Menu(menubar, tearoff=0) zoommenu.add_radiobutton(label='Tiny', variable=self._size, underline=0, value=10, command=self.resize) zoommenu.add_radiobutton(label='Small', variable=self._size, underline=0, value=12, command=self.resize) zoommenu.add_radiobutton(label='Medium', variable=self._size, underline=0, value=14, command=self.resize) zoommenu.add_radiobutton(label='Large', variable=self._size, underline=0, value=18, command=self.resize) zoommenu.add_radiobutton(label='Huge', variable=self._size, underline=0, value=24, command=self.resize) menubar.add_cascade(label='Zoom', underline=0, menu=zoommenu) helpmenu = tkinter.Menu(menubar, tearoff=0) helpmenu.add_command(label='About', underline=0, command=self.about) helpmenu.add_command(label='Instructions', underline=0, command=self.help, accelerator='F1') menubar.add_cascade(label='Help', underline=0, menu=helpmenu) self._root.config(menu=menubar) 
#//////////////////////////////////////////////////////////// # Selection Handling #//////////////////////////////////////////////////////////// def _click_cv_edge(self, edge): if edge != self._selection: # Clicking on a new edge selects it. self._select_edge(edge) else: # Repeated clicks on one edge cycle its trees. self._cv.cycle_tree() # [XX] this can get confused if animation is running # faster than the callbacks... def _select_matrix_edge(self, edge): self._select_edge(edge) self._cv.view_edge(edge) def _select_edge(self, edge): self._selection = edge # Update the chart view. self._cv.markonly_edge(edge, '#f00') self._cv.draw_tree(edge) # Update the matrix view. if self._matrix: self._matrix.markonly_edge(edge) if self._matrix: self._matrix.view_edge(edge) def _deselect_edge(self): self._selection = None # Update the chart view. self._cv.unmark_edge() self._cv.erase_tree() # Update the matrix view if self._matrix: self._matrix.unmark_edge() def _show_new_edge(self, edge): self._display_rule(self._cp.current_chartrule()) # Update the chart view. self._cv.update() self._cv.draw_tree(edge) self._cv.markonly_edge(edge, '#0df') self._cv.view_edge(edge) # Update the matrix view. if self._matrix: self._matrix.update() if self._matrix: self._matrix.markonly_edge(edge) if self._matrix: self._matrix.view_edge(edge) # Update the results view. if self._results: self._results.update(edge) #//////////////////////////////////////////////////////////// # Help/usage #//////////////////////////////////////////////////////////// def help(self, *e): self._animating = 0 # The default font's not very legible; try using 'fixed' instead. try: ShowText(self._root, 'Help: Chart Parser Application', (__doc__ or '').strip(), width=75, font='fixed') except: ShowText(self._root, 'Help: Chart Parser Application', (__doc__ or '').strip(), width=75) def about(self, *e): ABOUT = ("NLTK Chart Parser Application\n"+ "Written by Edward Loper") tkinter.messagebox.showinfo('About: Chart Parser Application', ABOUT) #//////////////////////////////////////////////////////////// # File Menu #//////////////////////////////////////////////////////////// CHART_FILE_TYPES = [('Pickle file', '.pickle'), ('All files', '*')] GRAMMAR_FILE_TYPES = [('Plaintext grammar file', '.cfg'), ('Pickle file', '.pickle'), ('All files', '*')] def load_chart(self, *args): "Load a chart from a pickle file" filename = askopenfilename(filetypes=self.CHART_FILE_TYPES, defaultextension='.pickle') if not filename: return try: with open(filename, 'rb') as infile: chart = pickle.load(infile) self._chart = chart self._cv.update(chart) if self._matrix: self._matrix.set_chart(chart) if self._matrix: self._matrix.deselect_cell() if self._results: self._results.set_chart(chart) self._cp.set_chart(chart) except Exception as e: raise tkinter.messagebox.showerror('Error Loading Chart', 'Unable to open file: %r' % filename) def save_chart(self, *args): "Save a chart to a pickle file" filename = asksaveasfilename(filetypes=self.CHART_FILE_TYPES, defaultextension='.pickle') if not filename: return try: with open(filename, 'wb') as outfile: pickle.dump(self._chart, outfile) except Exception as e: raise tkinter.messagebox.showerror('Error Saving Chart', 'Unable to open file: %r' % filename) def load_grammar(self, *args): "Load a grammar from a pickle file" filename = askopenfilename(filetypes=self.GRAMMAR_FILE_TYPES, defaultextension='.cfg') if not filename: return try: if filename.endswith('.pickle'): with open(filename, 'rb') as infile: grammar = pickle.load(infile) 
else: with open(filename, 'r') as infile: grammar = CFG.fromstring(infile.read()) self.set_grammar(grammar) except Exception as e: tkinter.messagebox.showerror('Error Loading Grammar', 'Unable to open file: %r' % filename) def save_grammar(self, *args): filename = asksaveasfilename(filetypes=self.GRAMMAR_FILE_TYPES, defaultextension='.cfg') if not filename: return try: if filename.endswith('.pickle'): with open(filename, 'wb') as outfile: pickle.dump((self._chart, self._tokens), outfile) else: with open(filename, 'w') as outfile: prods = self._grammar.productions() start = [p for p in prods if p.lhs() == self._grammar.start()] rest = [p for p in prods if p.lhs() != self._grammar.start()] for prod in start: outfile.write('%s\n' % prod) for prod in rest: outfile.write('%s\n' % prod) except Exception as e: tkinter.messagebox.showerror('Error Saving Grammar', 'Unable to open file: %r' % filename) def reset(self, *args): self._animating = 0 self._reset_parser() self._cv.update(self._chart) if self._matrix: self._matrix.set_chart(self._chart) if self._matrix: self._matrix.deselect_cell() if self._results: self._results.set_chart(self._chart) #//////////////////////////////////////////////////////////// # Edit #//////////////////////////////////////////////////////////// def edit_grammar(self, *e): CFGEditor(self._root, self._grammar, self.set_grammar) def set_grammar(self, grammar): self._grammar = grammar self._cp.set_grammar(grammar) if self._results: self._results.set_grammar(grammar) def edit_sentence(self, *e): sentence = " ".join(self._tokens) title = 'Edit Text' instr = 'Enter a new sentence to parse.' EntryDialog(self._root, sentence, instr, self.set_sentence, title) def set_sentence(self, sentence): self._tokens = list(sentence.split()) self.reset() #//////////////////////////////////////////////////////////// # View Menu #//////////////////////////////////////////////////////////// def view_matrix(self, *e): if self._matrix is not None: self._matrix.destroy() self._matrix = ChartMatrixView(self._root, self._chart) self._matrix.add_callback('select', self._select_matrix_edge) def view_results(self, *e): if self._results is not None: self._results.destroy() self._results = ChartResultsView(self._root, self._chart, self._grammar) #//////////////////////////////////////////////////////////// # Zoom Menu #//////////////////////////////////////////////////////////// def resize(self): self._animating = 0 self.set_font_size(self._size.get()) def set_font_size(self, size): self._cv.set_font_size(size) self._font.configure(size=-abs(size)) self._boldfont.configure(size=-abs(size)) self._sysfont.configure(size=-abs(size)) def get_font_size(self): return abs(self._size.get()) #//////////////////////////////////////////////////////////// # Parsing #//////////////////////////////////////////////////////////// def apply_strategy(self, strategy, edge_strategy=None): # If we're animating, then stop. if self._animating: self._animating = 0 return # Clear the rule display & mark. self._display_rule(None) #self._cv.unmark_edge() if self._step.get(): selection = self._selection if (selection is not None) and (edge_strategy is not None): # Apply the given strategy to the selected edge. self._cp.set_strategy([edge_strategy(selection)]) newedge = self._apply_strategy() # If it failed, then clear the selection. 
if newedge is None: self._cv.unmark_edge() self._selection = None else: self._cp.set_strategy(strategy) self._apply_strategy() else: self._cp.set_strategy(strategy) if self._animate.get(): self._animating = 1 self._animate_strategy() else: for edge in self._cpstep: if edge is None: break self._cv.update() if self._matrix: self._matrix.update() if self._results: self._results.update() def _stop_animation(self, *e): self._animating = 0 def _animate_strategy(self, speed=1): if self._animating == 0: return if self._apply_strategy() is not None: if self._animate.get() == 0 or self._step.get() == 1: return if self._animate.get() == 1: self._root.after(3000, self._animate_strategy) elif self._animate.get() == 2: self._root.after(1000, self._animate_strategy) else: self._root.after(20, self._animate_strategy) def _apply_strategy(self): new_edge = next(self._cpstep) if new_edge is not None: self._show_new_edge(new_edge) return new_edge def _display_rule(self, rule): if rule is None: self._rulelabel2['text'] = '' else: name = str(rule) self._rulelabel2['text'] = name size = self._cv.get_font_size() #//////////////////////////////////////////////////////////// # Parsing Strategies #//////////////////////////////////////////////////////////// # Basic rules: _TD_INIT = [TopDownInitRule()] _TD_PREDICT = [TopDownPredictRule()] _BU_RULE = [BottomUpPredictRule()] _BU_LC_RULE = [BottomUpPredictCombineRule()] _FUNDAMENTAL = [SingleEdgeFundamentalRule()] # Complete strategies: _TD_STRATEGY = _TD_INIT + _TD_PREDICT + _FUNDAMENTAL _BU_STRATEGY = _BU_RULE + _FUNDAMENTAL _BU_LC_STRATEGY = _BU_LC_RULE + _FUNDAMENTAL # Button callback functions: def top_down_init(self, *e): self.apply_strategy(self._TD_INIT, None) def top_down_predict(self, *e): self.apply_strategy(self._TD_PREDICT, TopDownPredictEdgeRule) def bottom_up(self, *e): self.apply_strategy(self._BU_RULE, BottomUpEdgeRule) def bottom_up_leftcorner(self, *e): self.apply_strategy(self._BU_LC_RULE, BottomUpLeftCornerEdgeRule) def fundamental(self, *e): self.apply_strategy(self._FUNDAMENTAL, FundamentalEdgeRule) def bottom_up_strategy(self, *e): self.apply_strategy(self._BU_STRATEGY, BottomUpEdgeRule) def bottom_up_leftcorner_strategy(self, *e): self.apply_strategy(self._BU_LC_STRATEGY, BottomUpLeftCornerEdgeRule) def top_down_strategy(self, *e): self.apply_strategy(self._TD_STRATEGY, TopDownPredictEdgeRule) def app(): grammar = CFG.fromstring(""" # Grammatical productions. S -> NP VP VP -> VP PP | V NP | V NP -> Det N | NP PP PP -> P NP # Lexical productions. 
NP -> 'John' | 'I' Det -> 'the' | 'my' | 'a' N -> 'dog' | 'cookie' | 'table' | 'cake' | 'fork' V -> 'ate' | 'saw' P -> 'on' | 'under' | 'with' """) sent = 'John ate the cake on the table with a fork' sent = 'John ate the cake on the table' tokens = list(sent.split()) print('grammar= (') for rule in grammar.productions(): print((' ', repr(rule)+',')) print(')') print(('tokens = %r' % tokens)) print('Calling "ChartParserApp(grammar, tokens)"...') ChartParserApp(grammar, tokens).mainloop() if __name__ == '__main__': app() # Chart comparer: #charts = ['/tmp/earley.pickle', # '/tmp/topdown.pickle', # '/tmp/bottomup.pickle'] #ChartComparer(*charts).mainloop() #import profile #profile.run('demo2()', '/tmp/profile.out') #import pstats #p = pstats.Stats('/tmp/profile.out') #p.strip_dirs().sort_stats('time', 'cum').print_stats(60) #p.strip_dirs().sort_stats('cum', 'time').print_stats(60) __all__ = ['app'] nltk-3.1/nltk/app/chunkparser_app.py0000644000076500000240000015405612607224144017360 0ustar sbstaff00000000000000# Natural Language Toolkit: Regexp Chunk Parser Application # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ A graphical tool for exploring the regular expression based chunk parser ``nltk.chunk.RegexpChunkParser``. """ # Todo: Add a way to select the development set from the menubar. This # might just need to be a selection box (conll vs treebank etc) plus # configuration parameters to select what's being chunked (eg VP vs NP) # and what part of the data is being used as the development set. import nltk.compat import time import textwrap import re import random import tkinter.filedialog, tkinter.font from tkinter import (Button, Canvas, Checkbutton, Frame, IntVar, Label, Menu, Scrollbar, Text, Tk) from nltk.tree import Tree from nltk.util import in_idle from nltk.draw.util import ShowText from nltk.corpus import conll2000, treebank_chunk from nltk.chunk import ChunkScore, RegexpChunkParser from nltk.chunk.regexp import RegexpChunkRule class RegexpChunkApp(object): """ A graphical tool for exploring the regular expression based chunk parser ``nltk.chunk.RegexpChunkParser``. See ``HELP`` for instructional text. """ ##///////////////////////////////////////////////////////////////// ## Help Text ##///////////////////////////////////////////////////////////////// #: A dictionary mapping from part of speech tags to descriptions, #: which is used in the help text. (This should probably live with #: the conll and/or treebank corpus instead.) TAGSET = { 'CC': 'Coordinating conjunction', 'PRP$': 'Possessive pronoun', 'CD': 'Cardinal number', 'RB': 'Adverb', 'DT': 'Determiner', 'RBR': 'Adverb, comparative', 'EX': 'Existential there', 'RBS': 'Adverb, superlative', 'FW': 'Foreign word', 'RP': 'Particle', 'JJ': 'Adjective', 'TO': 'to', 'JJR': 'Adjective, comparative', 'UH': 'Interjection', 'JJS': 'Adjective, superlative', 'VB': 'Verb, base form', 'LS': 'List item marker', 'VBD': 'Verb, past tense', 'MD': 'Modal', 'NNS': 'Noun, plural', 'NN': 'Noun, singular or masps', 'VBN': 'Verb, past participle', 'VBZ': 'Verb,3rd ps. sing. 
present', 'NNP': 'Proper noun, singular', 'NNPS': 'Proper noun plural', 'WDT': 'wh-determiner', 'PDT': 'Predeterminer', 'WP': 'wh-pronoun', 'POS': 'Possessive ending', 'WP$': 'Possessive wh-pronoun', 'PRP': 'Personal pronoun', 'WRB': 'wh-adverb', '(': 'open parenthesis', ')': 'close parenthesis', '``': 'open quote', ',': 'comma', "''": 'close quote', '.': 'period', '#': 'pound sign (currency marker)', '$': 'dollar sign (currency marker)', 'IN': 'Preposition/subord. conjunction', 'SYM': 'Symbol (mathematical or scientific)', 'VBG': 'Verb, gerund/present participle', 'VBP': 'Verb, non-3rd ps. sing. present', ':': 'colon', } #: Contents for the help box. This is a list of tuples, one for #: each help page, where each tuple has four elements: #: - A title (displayed as a tab) #: - A string description of tabstops (see Tkinter.Text for details) #: - The text contents for the help page. You can use expressions #: like ... to colorize the text; see ``HELP_AUTOTAG`` #: for a list of tags you can use for colorizing. HELP = [ ('Help', '20', "Welcome to the regular expression chunk-parser grammar editor. " "You can use this editor to develop and test chunk parser grammars " "based on NLTK's RegexpChunkParser class.\n\n" # Help box. "Use this box ('Help') to learn more about the editor; click on the " "tabs for help on specific topics:" "\n" "Rules: grammar rule types\n" "Regexps: regular expression syntax\n" "Tags: part of speech tags\n\n" # Grammar. "Use the upper-left box ('Grammar') to edit your grammar. " "Each line of your grammar specifies a single 'rule', " "which performs an action such as creating a chunk or merging " "two chunks.\n\n" # Dev set. "The lower-left box ('Development Set') runs your grammar on the " "development set, and displays the results. " "Your grammar's chunks are highlighted, and " "the correct (gold standard) chunks are " "underlined. If they " "match, they are displayed in green; otherwise, " "they are displayed in red. The box displays a single " "sentence from the development set at a time; use the scrollbar or " "the next/previous buttons view additional sentences.\n\n" # Performance "The lower-right box ('Evaluation') tracks the performance of " "your grammar on the development set. The 'precision' axis " "indicates how many of your grammar's chunks are correct; and " "the 'recall' axis indicates how many of the gold standard " "chunks your system generated. Typically, you should try to " "design a grammar that scores high on both metrics. The " "exact precision and recall of the current grammar, as well " "as their harmonic mean (the 'f-score'), are displayed in " "the status bar at the bottom of the window." ), ('Rules', '10', "

{...regexp...}

" "\nChunk rule: creates new chunks from words matching " "regexp.\n\n" "

}...regexp...{

" "\nChink rule: removes words matching regexp from existing " "chunks.\n\n" "

...regexp1...}{...regexp2...

" "\nSplit rule: splits chunks that match regexp1 followed by " "regexp2 in two.\n\n" "

...regexp...{}...regexp...

" "\nMerge rule: joins consecutive chunks that match regexp1 " "and regexp2\n" ), ('Regexps', '10 60', #"Regular Expression Syntax Summary:\n\n" "

Pattern\t\tMatches...

\n" "" "\t<T>\ta word with tag T " "(where T may be a regexp).\n" "\tx?\tan optional x\n" "\tx+\ta sequence of 1 or more x's\n" "\tx*\ta sequence of 0 or more x's\n" "\tx|y\tx or y\n" "\t.\tmatches any character\n" "\t(x)\tTreats x as a group\n" "\t# x...\tTreats x... " "(to the end of the line) as a comment\n" "\t\\C\tmatches character C " "(useful when C is a special character " "like + or #)\n" "" "\n

Examples:

\n" "" '\t\n' '\t\tMatches "cow/NN"\n' '\t\tMatches "green/NN"\n' '\t\n' '\t\tMatches "eating/VBG"\n' '\t\tMatches "ate/VBD"\n' '\t
\n' '\t\tMatches "on/IN the/DT car/NN"\n' '\t?\n' '\t\tMatches "ran/VBD"\n' '\t\tMatches "slowly/RB ate/VBD"\n' '\t<\#> # This is a comment...\n' '\t\tMatches "#/# 100/CD"\n' "" ), ('Tags', '10 60', "

Part of Speech Tags:

\n" + '' + '<>' + # this gets auto-substituted w/ self.TAGSET '\n') ] HELP_AUTOTAG = [ ('red', dict(foreground='#a00')), ('green', dict(foreground='#080')), ('highlight', dict(background='#ddd')), ('underline', dict(underline=True)), ('h1', dict(underline=True)), ('indent', dict(lmargin1=20, lmargin2=20)), ('hangindent', dict(lmargin1=0, lmargin2=60)), ('var', dict(foreground='#88f')), ('regexp', dict(foreground='#ba7')), ('match', dict(foreground='#6a6')), ] ##///////////////////////////////////////////////////////////////// ## Config Parmeters ##///////////////////////////////////////////////////////////////// _EVAL_DELAY = 1 """If the user has not pressed any key for this amount of time (in seconds), and the current grammar has not been evaluated, then the eval demon will evaluate it.""" _EVAL_CHUNK = 15 """The number of sentences that should be evaluated by the eval demon each time it runs.""" _EVAL_FREQ = 0.2 """The frequency (in seconds) at which the eval demon is run""" _EVAL_DEMON_MIN = .02 """The minimum amount of time that the eval demon should take each time it runs -- if it takes less than this time, _EVAL_CHUNK will be modified upwards.""" _EVAL_DEMON_MAX = .04 """The maximum amount of time that the eval demon should take each time it runs -- if it takes more than this time, _EVAL_CHUNK will be modified downwards.""" _GRAMMARBOX_PARAMS = dict( width=40, height=12, background='#efe', highlightbackground='#efe', highlightthickness=1, relief='groove', border=2, wrap='word') _HELPBOX_PARAMS = dict( width=15, height=15, background='#efe', highlightbackground='#efe', foreground='#555', highlightthickness=1, relief='groove', border=2, wrap='word') _DEVSETBOX_PARAMS = dict( width=70, height=10, background='#eef', highlightbackground='#eef', highlightthickness=1, relief='groove', border=2, wrap='word', tabs=(30,)) _STATUS_PARAMS = dict( background='#9bb', relief='groove', border=2) _FONT_PARAMS = dict( family='helvetica', size=-20) _FRAME_PARAMS = dict( background='#777', padx=2, pady=2, border=3) _EVALBOX_PARAMS = dict( background='#eef', highlightbackground='#eef', highlightthickness=1, relief='groove', border=2, width=300, height=280) _BUTTON_PARAMS = dict( background='#777', activebackground='#777', highlightbackground='#777') _HELPTAB_BG_COLOR = '#aba' _HELPTAB_FG_COLOR = '#efe' _HELPTAB_FG_PARAMS = dict(background='#efe') _HELPTAB_BG_PARAMS = dict(background='#aba') _HELPTAB_SPACER = 6 def normalize_grammar(self, grammar): # Strip comments grammar = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', grammar) # Normalize whitespace grammar = re.sub(' +', ' ', grammar) grammar = re.sub('\n\s+', '\n', grammar) grammar = grammar.strip() # [xx] Hack: automatically backslash $! grammar = re.sub(r'([^\\])\$', r'\1\\$', grammar) return grammar def __init__(self, devset_name='conll2000', devset=None, grammar = '', chunk_label='NP', tagset=None): """ :param devset_name: The name of the development set; used for display & for save files. If either the name 'treebank' or the name 'conll2000' is used, and devset is None, then devset will be set automatically. :param devset: A list of chunked sentences :param grammar: The initial grammar to display. :param tagset: Dictionary from tags to string descriptions, used for the help page. Defaults to ``self.TAGSET``. 
""" self._chunk_label = chunk_label if tagset is None: tagset = self.TAGSET self.tagset = tagset # Named development sets: if devset is None: if devset_name == 'conll2000': devset = conll2000.chunked_sents('train.txt')#[:100] elif devset == 'treebank': devset = treebank_chunk.chunked_sents()#[:100] else: raise ValueError('Unknown development set %s' % devset_name) self.chunker = None """The chunker built from the grammar string""" self.grammar = grammar """The unparsed grammar string""" self.normalized_grammar = None """A normalized version of ``self.grammar``.""" self.grammar_changed = 0 """The last time() that the grammar was changed.""" self.devset = devset """The development set -- a list of chunked sentences.""" self.devset_name = devset_name """The name of the development set (for save files).""" self.devset_index = -1 """The index into the development set of the first instance that's currently being viewed.""" self._last_keypress = 0 """The time() when a key was most recently pressed""" self._history = [] """A list of (grammar, precision, recall, fscore) tuples for grammars that the user has already tried.""" self._history_index = 0 """When the user is scrolling through previous grammars, this is used to keep track of which grammar they're looking at.""" self._eval_grammar = None """The grammar that is being currently evaluated by the eval demon.""" self._eval_normalized_grammar = None """A normalized copy of ``_eval_grammar``.""" self._eval_index = 0 """The index of the next sentence in the development set that should be looked at by the eval demon.""" self._eval_score = ChunkScore(chunk_label=chunk_label) """The ``ChunkScore`` object that's used to keep track of the score of the current grammar on the development set.""" # Set up the main window. top = self.top = Tk() top.geometry('+50+50') top.title('Regexp Chunk Parser App') top.bind('', self.destroy) # Varaible that restricts how much of the devset we look at. self._devset_size = IntVar(top) self._devset_size.set(100) # Set up all the tkinter widgets self._init_fonts(top) self._init_widgets(top) self._init_bindings(top) self._init_menubar(top) self.grammarbox.focus() # If a grammar was given, then display it. 
if grammar: self.grammarbox.insert('end', grammar+'\n') self.grammarbox.mark_set('insert', '1.0') # Display the first item in the development set self.show_devset(0) self.update() def _init_bindings(self, top): top.bind('', self._devset_next) top.bind('', self._devset_prev) top.bind('', self.toggle_show_trace) top.bind('', self.update) top.bind('', lambda e: self.save_grammar()) top.bind('', lambda e: self.load_grammar()) self.grammarbox.bind('', self.toggle_show_trace) self.grammarbox.bind('', self._devset_next) self.grammarbox.bind('', self._devset_prev) # Redraw the eval graph when the window size changes self.evalbox.bind('', self._eval_plot) def _init_fonts(self, top): # TWhat's our font size (default=same as sysfont) self._size = IntVar(top) self._size.set(20) self._font = tkinter.font.Font(family='helvetica', size=-self._size.get()) self._smallfont = tkinter.font.Font(family='helvetica', size=-(int(self._size.get()*14/20))) def _init_menubar(self, parent): menubar = Menu(parent) filemenu = Menu(menubar, tearoff=0) filemenu.add_command(label='Reset Application', underline=0, command=self.reset) filemenu.add_command(label='Save Current Grammar', underline=0, accelerator='Ctrl-s', command=self.save_grammar) filemenu.add_command(label='Load Grammar', underline=0, accelerator='Ctrl-o', command=self.load_grammar) filemenu.add_command(label='Save Grammar History', underline=13, command=self.save_history) filemenu.add_command(label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-q') menubar.add_cascade(label='File', underline=0, menu=filemenu) viewmenu = Menu(menubar, tearoff=0) viewmenu.add_radiobutton(label='Tiny', variable=self._size, underline=0, value=10, command=self.resize) viewmenu.add_radiobutton(label='Small', variable=self._size, underline=0, value=16, command=self.resize) viewmenu.add_radiobutton(label='Medium', variable=self._size, underline=0, value=20, command=self.resize) viewmenu.add_radiobutton(label='Large', variable=self._size, underline=0, value=24, command=self.resize) viewmenu.add_radiobutton(label='Huge', variable=self._size, underline=0, value=34, command=self.resize) menubar.add_cascade(label='View', underline=0, menu=viewmenu) devsetmenu = Menu(menubar, tearoff=0) devsetmenu.add_radiobutton(label='50 sentences', variable=self._devset_size, value=50, command=self.set_devset_size) devsetmenu.add_radiobutton(label='100 sentences', variable=self._devset_size, value=100, command=self.set_devset_size) devsetmenu.add_radiobutton(label='200 sentences', variable=self._devset_size, value=200, command=self.set_devset_size) devsetmenu.add_radiobutton(label='500 sentences', variable=self._devset_size, value=500, command=self.set_devset_size) menubar.add_cascade(label='Development-Set', underline=0, menu=devsetmenu) helpmenu = Menu(menubar, tearoff=0) helpmenu.add_command(label='About', underline=0, command=self.about) menubar.add_cascade(label='Help', underline=0, menu=helpmenu) parent.config(menu=menubar) def toggle_show_trace(self, *e): if self._showing_trace: self.show_devset() else: self.show_trace() return 'break' _SCALE_N = 5 # center on the last 5 examples. _DRAW_LINES = False def _eval_plot(self, *e, **config): width = config.get('width', self.evalbox.winfo_width()) height = config.get('height', self.evalbox.winfo_height()) # Clear the canvas self.evalbox.delete('all') # Draw the precision & recall labels. 
tag = self.evalbox.create_text(10, height/2-10, justify='left', anchor='w', text='Precision') left, right = self.evalbox.bbox(tag)[2] + 5, width-10 tag = self.evalbox.create_text(left + (width-left)/2, height-10, anchor='s', text='Recall', justify='center') top, bot = 10, self.evalbox.bbox(tag)[1]-10 # Draw masks for clipping the plot. bg = self._EVALBOX_PARAMS['background'] self.evalbox.lower(self.evalbox.create_rectangle(0, 0, left-1, 5000, fill=bg, outline=bg)) self.evalbox.lower(self.evalbox.create_rectangle(0, bot+1, 5000, 5000, fill=bg, outline=bg)) # Calculate the plot's scale. if self._autoscale.get() and len(self._history) > 1: max_precision = max_recall = 0 min_precision = min_recall = 1 for i in range(1, min(len(self._history), self._SCALE_N+1)): grammar, precision, recall, fmeasure = self._history[-i] min_precision = min(precision, min_precision) min_recall = min(recall, min_recall) max_precision = max(precision, max_precision) max_recall = max(recall, max_recall) # if max_precision-min_precision > max_recall-min_recall: # min_recall -= (max_precision-min_precision)/2 # max_recall += (max_precision-min_precision)/2 # else: # min_precision -= (max_recall-min_recall)/2 # max_precision += (max_recall-min_recall)/2 # if min_recall < 0: # max_recall -= min_recall # min_recall = 0 # if min_precision < 0: # max_precision -= min_precision # min_precision = 0 min_precision = max(min_precision-.01, 0) min_recall = max(min_recall-.01, 0) max_precision = min(max_precision+.01, 1) max_recall = min(max_recall+.01, 1) else: min_precision = min_recall = 0 max_precision = max_recall = 1 # Draw the axis lines & grid lines for i in range(11): x = left + (right-left)*((i/10.-min_recall)/ (max_recall-min_recall)) y = bot - (bot-top)*((i/10.-min_precision)/ (max_precision-min_precision)) if left < x < right: self.evalbox.create_line(x, top, x, bot, fill='#888') if top < y < bot: self.evalbox.create_line(left, y, right, y, fill='#888') self.evalbox.create_line(left, top, left, bot) self.evalbox.create_line(left, bot, right, bot) # Display the plot's scale self.evalbox.create_text( left-3, bot, justify='right', anchor='se', text='%d%%' % (100*min_precision)) self.evalbox.create_text( left-3, top, justify='right', anchor='ne', text='%d%%' % (100*max_precision)) self.evalbox.create_text( left, bot+3, justify='center', anchor='nw', text='%d%%' % (100*min_recall)) self.evalbox.create_text( right, bot+3, justify='center', anchor='ne', text='%d%%' % (100*max_recall)) # Display the scores. prev_x = prev_y = None for i, (_, precision, recall, fscore) in enumerate(self._history): x = left + (right-left) * ((recall-min_recall) / (max_recall-min_recall)) y = bot - (bot-top) * ((precision-min_precision) / (max_precision-min_precision)) if i == self._history_index: self.evalbox.create_oval(x-2,y-2,x+2,y+2, fill='#0f0', outline='#000') self.status['text'] = ( 'Precision: %.2f%%\t' % (precision*100)+ 'Recall: %.2f%%\t' % (recall*100)+ 'F-score: %.2f%%' % (fscore*100)) else: self.evalbox.lower( self.evalbox.create_oval(x-2,y-2,x+2,y+2, fill='#afa', outline='#8c8')) if prev_x is not None and self._eval_lines.get(): self.evalbox.lower( self.evalbox.create_line(prev_x, prev_y, x, y, fill='#8c8')) prev_x, prev_y = x, y _eval_demon_running = False def _eval_demon(self): if self.top is None: return if self.chunker is None: self._eval_demon_running = False return # Note our starting time. t0 = time.time() # If are still typing, then wait for them to finish. 
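        # (Debounce: reschedule ourselves every _EVAL_FREQ seconds, and
        # only start evaluating once _EVAL_DELAY seconds have passed
        # since the last keypress.)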
if (time.time()-self._last_keypress < self._EVAL_DELAY and self.normalized_grammar != self._eval_normalized_grammar): self._eval_demon_running = True return self.top.after(int(self._EVAL_FREQ*1000), self._eval_demon) # If the grammar changed, restart the evaluation. if self.normalized_grammar != self._eval_normalized_grammar: # Check if we've seen this grammar already. If so, then # just use the old evaluation values. for (g, p, r, f) in self._history: if self.normalized_grammar == self.normalize_grammar(g): self._history.append( (g, p, r, f) ) self._history_index = len(self._history) - 1 self._eval_plot() self._eval_demon_running = False self._eval_normalized_grammar = None return self._eval_index = 0 self._eval_score = ChunkScore(chunk_label=self._chunk_label) self._eval_grammar = self.grammar self._eval_normalized_grammar = self.normalized_grammar # If the grammar is empty, the don't bother evaluating it, or # recording it in history -- the score will just be 0. if self.normalized_grammar.strip() == '': #self._eval_index = self._devset_size.get() self._eval_demon_running = False return # Score the next set of examples for gold in self.devset[self._eval_index: min(self._eval_index+self._EVAL_CHUNK, self._devset_size.get())]: guess = self._chunkparse(gold.leaves()) self._eval_score.score(gold, guess) # update our index in the devset. self._eval_index += self._EVAL_CHUNK # Check if we're done if self._eval_index >= self._devset_size.get(): self._history.append( (self._eval_grammar, self._eval_score.precision(), self._eval_score.recall(), self._eval_score.f_measure()) ) self._history_index = len(self._history)-1 self._eval_plot() self._eval_demon_running = False self._eval_normalized_grammar = None else: progress = 100*self._eval_index/self._devset_size.get() self.status['text'] = ('Evaluating on Development Set (%d%%)' % progress) self._eval_demon_running = True self._adaptively_modify_eval_chunk(time.time() - t0) self.top.after(int(self._EVAL_FREQ*1000), self._eval_demon) def _adaptively_modify_eval_chunk(self, t): """ Modify _EVAL_CHUNK to try to keep the amount of time that the eval demon takes between _EVAL_DEMON_MIN and _EVAL_DEMON_MAX. :param t: The amount of time that the eval demon took. 
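        For example, a pass that takes roughly twice _EVAL_DEMON_MAX will
        approximately halve _EVAL_CHUNK (shrinking it by at least 1 and at
        most 10 sentences); a pass faster than _EVAL_DEMON_MIN grows it
        proportionally, with the same 1-10 clamp.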
""" if t > self._EVAL_DEMON_MAX and self._EVAL_CHUNK > 5: self._EVAL_CHUNK = min(self._EVAL_CHUNK-1, max(int(self._EVAL_CHUNK*(self._EVAL_DEMON_MAX/t)), self._EVAL_CHUNK-10)) elif t < self._EVAL_DEMON_MIN: self._EVAL_CHUNK = max(self._EVAL_CHUNK+1, min(int(self._EVAL_CHUNK*(self._EVAL_DEMON_MIN/t)), self._EVAL_CHUNK+10)) def _init_widgets(self, top): frame0 = Frame(top, **self._FRAME_PARAMS) frame0.grid_columnconfigure(0, weight=4) frame0.grid_columnconfigure(3, weight=2) frame0.grid_rowconfigure(1, weight=1) frame0.grid_rowconfigure(5, weight=1) # The grammar self.grammarbox = Text(frame0, font=self._font, **self._GRAMMARBOX_PARAMS) self.grammarlabel = Label(frame0, font=self._font, text='Grammar:', highlightcolor='black', background=self._GRAMMARBOX_PARAMS['background']) self.grammarlabel.grid(column=0, row=0, sticky='SW') self.grammarbox.grid(column=0, row=1, sticky='NEWS') # Scroll bar for grammar grammar_scrollbar = Scrollbar(frame0, command=self.grammarbox.yview) grammar_scrollbar.grid(column=1, row=1, sticky='NWS') self.grammarbox.config(yscrollcommand=grammar_scrollbar.set) # grammar buttons bg = self._FRAME_PARAMS['background'] frame3 = Frame(frame0, background=bg) frame3.grid(column=0, row=2, sticky='EW') Button(frame3, text='Prev Grammar', command=self._history_prev, **self._BUTTON_PARAMS).pack(side='left') Button(frame3, text='Next Grammar', command=self._history_next, **self._BUTTON_PARAMS).pack(side='left') # Help box self.helpbox = Text(frame0, font=self._smallfont, **self._HELPBOX_PARAMS) self.helpbox.grid(column=3, row=1, sticky='NEWS') self.helptabs = {} bg = self._FRAME_PARAMS['background'] helptab_frame = Frame(frame0, background=bg) helptab_frame.grid(column=3, row=0, sticky='SW') for i, (tab, tabstops, text) in enumerate(self.HELP): label = Label(helptab_frame, text=tab, font=self._smallfont) label.grid(column=i*2, row=0, sticky='S') #help_frame.grid_columnconfigure(i, weight=1) #label.pack(side='left') label.bind('', lambda e, tab=tab: self.show_help(tab)) self.helptabs[tab] = label Frame(helptab_frame, height=1, width=self._HELPTAB_SPACER, background=bg).grid(column=i*2+1, row=0) self.helptabs[self.HELP[0][0]].configure(font=self._font) self.helpbox.tag_config('elide', elide=True) for (tag, params) in self.HELP_AUTOTAG: self.helpbox.tag_config('tag-%s' % tag, **params) self.show_help(self.HELP[0][0]) # Scroll bar for helpbox help_scrollbar = Scrollbar(frame0, command=self.helpbox.yview) self.helpbox.config(yscrollcommand=help_scrollbar.set) help_scrollbar.grid(column=4, row=1, sticky='NWS') # The dev set frame4 = Frame(frame0, background=self._FRAME_PARAMS['background']) self.devsetbox = Text(frame4, font=self._font, **self._DEVSETBOX_PARAMS) self.devsetbox.pack(expand=True, fill='both') self.devsetlabel = Label(frame0, font=self._font, text='Development Set:', justify='right', background=self._DEVSETBOX_PARAMS['background']) self.devsetlabel.grid(column=0, row=4, sticky='SW') frame4.grid(column=0, row=5, sticky='NEWS') # dev set scrollbars self.devset_scroll = Scrollbar(frame0, command=self._devset_scroll) self.devset_scroll.grid(column=1, row=5, sticky='NWS') self.devset_xscroll = Scrollbar(frame4, command=self.devsetbox.xview, orient='horiz') self.devsetbox['xscrollcommand'] = self.devset_xscroll.set self.devset_xscroll.pack(side='bottom', fill='x') # dev set buttons bg = self._FRAME_PARAMS['background'] frame1 = Frame(frame0, background=bg) frame1.grid(column=0, row=7, sticky='EW') Button(frame1, text='Prev Example (Ctrl-p)', command=self._devset_prev, 
**self._BUTTON_PARAMS).pack(side='left') Button(frame1, text='Next Example (Ctrl-n)', command=self._devset_next, **self._BUTTON_PARAMS).pack(side='left') self.devset_button = Button(frame1, text='Show example', command=self.show_devset, state='disabled', **self._BUTTON_PARAMS) self.devset_button.pack(side='right') self.trace_button = Button(frame1, text='Show trace', command=self.show_trace, **self._BUTTON_PARAMS) self.trace_button.pack(side='right') # evaluation box self.evalbox = Canvas(frame0, **self._EVALBOX_PARAMS) label = Label(frame0, font=self._font, text='Evaluation:', justify='right', background=self._EVALBOX_PARAMS['background']) label.grid(column=3, row=4, sticky='SW') self.evalbox.grid(column=3, row=5, sticky='NEWS', columnspan=2) # evaluation box buttons bg = self._FRAME_PARAMS['background'] frame2 = Frame(frame0, background=bg) frame2.grid(column=3, row=7, sticky='EW') self._autoscale = IntVar(self.top) self._autoscale.set(False) Checkbutton(frame2, variable=self._autoscale, command=self._eval_plot, text='Zoom', **self._BUTTON_PARAMS).pack(side='left') self._eval_lines = IntVar(self.top) self._eval_lines.set(False) Checkbutton(frame2, variable=self._eval_lines, command=self._eval_plot, text='Lines', **self._BUTTON_PARAMS).pack(side='left') Button(frame2, text='History', **self._BUTTON_PARAMS).pack(side='right') # The status label self.status = Label(frame0, font=self._font, **self._STATUS_PARAMS) self.status.grid(column=0, row=9, sticky='NEW', padx=3, pady=2, columnspan=5) # Help box & devset box can't be edited. self.helpbox['state'] = 'disabled' self.devsetbox['state'] = 'disabled' # Spacers bg = self._FRAME_PARAMS['background'] Frame(frame0, height=10, width=0, background=bg).grid(column=0, row=3) Frame(frame0, height=0, width=10, background=bg).grid(column=2, row=0) Frame(frame0, height=6, width=0, background=bg).grid(column=0, row=8) # pack the frame. 
frame0.pack(fill='both', expand=True) # Set up colors for the devset box self.devsetbox.tag_config('true-pos', background='#afa', underline='True') self.devsetbox.tag_config('false-neg', underline='True', foreground='#800') self.devsetbox.tag_config('false-pos', background='#faa') self.devsetbox.tag_config('trace', foreground='#666', wrap='none') self.devsetbox.tag_config('wrapindent', lmargin2=30, wrap='none') self.devsetbox.tag_config('error', foreground='#800') # And for the grammarbox self.grammarbox.tag_config('error', background='#fec') self.grammarbox.tag_config('comment', foreground='#840') self.grammarbox.tag_config('angle', foreground='#00f') self.grammarbox.tag_config('brace', foreground='#0a0') self.grammarbox.tag_config('hangindent', lmargin1=0, lmargin2=40) _showing_trace = False def show_trace(self, *e): self._showing_trace = True self.trace_button['state'] = 'disabled' self.devset_button['state'] = 'normal' self.devsetbox['state'] = 'normal' #self.devsetbox['wrap'] = 'none' self.devsetbox.delete('1.0', 'end') self.devsetlabel['text']='Development Set (%d/%d)' % ( (self.devset_index+1, self._devset_size.get())) if self.chunker is None: self.devsetbox.insert('1.0', 'Trace: waiting for a valid grammar.') self.devsetbox.tag_add('error', '1.0', 'end') return # can't do anything more gold_tree = self.devset[self.devset_index] rules = self.chunker.rules() # Calculate the tag sequence tagseq = '\t' charnum = [1] for wordnum, (word, pos) in enumerate(gold_tree.leaves()): tagseq += '%s ' % pos charnum.append(len(tagseq)) self.charnum = dict(((i, j), charnum[j]) for i in range(len(rules)+1) for j in range(len(charnum))) self.linenum = dict((i,i*2+2) for i in range(len(rules)+1)) for i in range(len(rules)+1): if i == 0: self.devsetbox.insert('end', 'Start:\n') self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c') else: self.devsetbox.insert('end', 'Apply %s:\n' % rules[i-1]) self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c') # Display the tag sequence. self.devsetbox.insert('end', tagseq+'\n') self.devsetbox.tag_add('wrapindent','end -2c linestart','end -2c') # Run a partial parser, and extract gold & test chunks chunker = RegexpChunkParser(rules[:i]) test_tree = self._chunkparse(gold_tree.leaves()) gold_chunks = self._chunks(gold_tree) test_chunks = self._chunks(test_tree) # Compare them. for chunk in gold_chunks.intersection(test_chunks): self._color_chunk(i, chunk, 'true-pos') for chunk in gold_chunks - test_chunks: self._color_chunk(i, chunk, 'false-neg') for chunk in test_chunks - gold_chunks: self._color_chunk(i, chunk, 'false-pos') self.devsetbox.insert('end', 'Finished.\n') self.devsetbox.tag_add('trace', 'end -2c linestart', 'end -2c') # This is a hack, because the x-scrollbar isn't updating its # position right -- I'm not sure what the underlying cause is # though. 
(This is on OS X w/ python 2.5) self.top.after(100, self.devset_xscroll.set, 0, .3) def show_help(self, tab): self.helpbox['state'] = 'normal' self.helpbox.delete('1.0', 'end') for (name, tabstops, text) in self.HELP: if name == tab: text = text.replace('<>', '\n'.join( ('\t%s\t%s' % item for item in sorted(list(self.tagset.items()), key=lambda t_w:re.match('\w+',t_w[0]) and (0,t_w[0]) or (1,t_w[0]))))) self.helptabs[name].config(**self._HELPTAB_FG_PARAMS) self.helpbox.config(tabs=tabstops) self.helpbox.insert('1.0', text+'\n'*20) C = '1.0 + %d chars' for (tag, params) in self.HELP_AUTOTAG: pattern = '(?s)(<%s>)(.*?)()' % (tag, tag) for m in re.finditer(pattern, text): self.helpbox.tag_add('elide', C % m.start(1), C % m.end(1)) self.helpbox.tag_add('tag-%s' % tag, C % m.start(2), C % m.end(2)) self.helpbox.tag_add('elide', C % m.start(3), C % m.end(3)) else: self.helptabs[name].config(**self._HELPTAB_BG_PARAMS) self.helpbox['state'] = 'disabled' def _history_prev(self, *e): self._view_history(self._history_index-1) return 'break' def _history_next(self, *e): self._view_history(self._history_index+1) return 'break' def _view_history(self, index): # Bounds & sanity checking: index = max(0, min(len(self._history)-1, index)) if not self._history: return # Already viewing the requested history item? if index == self._history_index: return # Show the requested grammar. It will get added to _history # only if they edit it (causing self.update() to get run.) self.grammarbox['state'] = 'normal' self.grammarbox.delete('1.0', 'end') self.grammarbox.insert('end', self._history[index][0]) self.grammarbox.mark_set('insert', '1.0') self._history_index = index self._syntax_highlight_grammar(self._history[index][0]) # Record the normalized grammar & regenerate the chunker. self.normalized_grammar = self.normalize_grammar( self._history[index][0]) if self.normalized_grammar: rules = [RegexpChunkRule.fromstring(line) for line in self.normalized_grammar.split('\n')] else: rules = [] self.chunker = RegexpChunkParser(rules) # Show the score. self._eval_plot() # Update the devset box self._highlight_devset() if self._showing_trace: self.show_trace() # Update the grammar label if self._history_index < len(self._history)-1: self.grammarlabel['text'] = 'Grammar %s/%s:' % ( self._history_index+1, len(self._history)) else: self.grammarlabel['text'] = 'Grammar:' def _devset_next(self, *e): self._devset_scroll('scroll', 1, 'page') return 'break' def _devset_prev(self, *e): self._devset_scroll('scroll', -1, 'page') return 'break' def destroy(self, *e): if self.top is None: return self.top.destroy() self.top = None def _devset_scroll(self, command, *args): N = 1 # size of a page -- one sentence. showing_trace = self._showing_trace if command == 'scroll' and args[1].startswith('unit'): self.show_devset(self.devset_index+int(args[0])) elif command == 'scroll' and args[1].startswith('page'): self.show_devset(self.devset_index+N*int(args[0])) elif command == 'moveto': self.show_devset(int(float(args[0])*self._devset_size.get())) else: assert 0, 'bad scroll command %s %s' % (command, args) if showing_trace: self.show_trace() def show_devset(self, index=None): if index is None: index = self.devset_index # Bounds checking index = min(max(0, index), self._devset_size.get()-1) if index == self.devset_index and not self._showing_trace: return self.devset_index = index self._showing_trace = False self.trace_button['state'] = 'normal' self.devset_button['state'] = 'disabled' # Clear the text box. 
self.devsetbox['state'] = 'normal' self.devsetbox['wrap'] = 'word' self.devsetbox.delete('1.0', 'end') self.devsetlabel['text']='Development Set (%d/%d)' % ( (self.devset_index+1, self._devset_size.get())) # Add the sentences sample = self.devset[self.devset_index:self.devset_index+1] self.charnum = {} self.linenum = {0:1} for sentnum, sent in enumerate(sample): linestr = '' for wordnum, (word, pos) in enumerate(sent.leaves()): self.charnum[sentnum, wordnum] = len(linestr) linestr += '%s/%s ' % (word, pos) self.charnum[sentnum, wordnum+1] = len(linestr) self.devsetbox.insert('end', linestr[:-1]+'\n\n') # Highlight chunks in the dev set if self.chunker is not None: self._highlight_devset() self.devsetbox['state'] = 'disabled' # Update the scrollbar first = float(self.devset_index)/self._devset_size.get() last = float(self.devset_index+2)/self._devset_size.get() self.devset_scroll.set(first, last) def _chunks(self, tree): chunks = set() wordnum = 0 for child in tree: if isinstance(child, Tree): if child.label() == self._chunk_label: chunks.add( (wordnum, wordnum+len(child)) ) wordnum += len(child) else: wordnum += 1 return chunks def _syntax_highlight_grammar(self, grammar): if self.top is None: return self.grammarbox.tag_remove('comment', '1.0', 'end') self.grammarbox.tag_remove('angle', '1.0', 'end') self.grammarbox.tag_remove('brace', '1.0', 'end') self.grammarbox.tag_add('hangindent', '1.0', 'end') for lineno, line in enumerate(grammar.split('\n')): if not line.strip(): continue m = re.match(r'(\\.|[^#])*(#.*)?', line) comment_start = None if m.group(2): comment_start = m.start(2) s = '%d.%d' % (lineno+1, m.start(2)) e = '%d.%d' % (lineno+1, m.end(2)) self.grammarbox.tag_add('comment', s, e) for m in re.finditer('[<>{}]', line): if comment_start is not None and m.start() >= comment_start: break s = '%d.%d' % (lineno+1, m.start()) e = '%d.%d' % (lineno+1, m.end()) if m.group() in '<>': self.grammarbox.tag_add('angle', s, e) else: self.grammarbox.tag_add('brace', s, e) def _grammarcheck(self, grammar): if self.top is None: return self.grammarbox.tag_remove('error', '1.0', 'end') self._grammarcheck_errs = [] for lineno, line in enumerate(grammar.split('\n')): line = re.sub(r'((\\.|[^#])*)(#.*)?', r'\1', line) line = line.strip() if line: try: RegexpChunkRule.fromstring(line) except ValueError as e: self.grammarbox.tag_add('error', '%s.0' % (lineno+1), '%s.0 lineend' % (lineno+1)) self.status['text'] = '' def update(self, *event): # Record when update was called (for grammarcheck) if event: self._last_keypress = time.time() # Read the grammar from the Text box. self.grammar = grammar = self.grammarbox.get('1.0', 'end') # If the grammar hasn't changed, do nothing: normalized_grammar = self.normalize_grammar(grammar) if normalized_grammar == self.normalized_grammar: return else: self.normalized_grammar = normalized_grammar # If the grammar has changed, and we're looking at history, # then stop looking at history. if self._history_index < len(self._history)-1: self.grammarlabel['text'] = 'Grammar:' self._syntax_highlight_grammar(grammar) # The grammar has changed; try parsing it. If it doesn't # parse, do nothing. (flag error location?) try: # Note: the normalized grammar has no blank lines. if normalized_grammar: rules = [RegexpChunkRule.fromstring(line) for line in normalized_grammar.split('\n')] else: rules = [] except ValueError as e: # Use the un-normalized grammar for error highlighting. 
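            # (_grammarcheck tags errors by line number in the Text
            # widget, so it must see the grammar exactly as typed --
            # the normalized copy has comments and blank lines removed.)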
self._grammarcheck(grammar) self.chunker = None return self.chunker = RegexpChunkParser(rules) self.grammarbox.tag_remove('error', '1.0', 'end') self.grammar_changed = time.time() # Display the results if self._showing_trace: self.show_trace() else: self._highlight_devset() # Start the eval demon if not self._eval_demon_running: self._eval_demon() def _highlight_devset(self, sample=None): if sample is None: sample = self.devset[self.devset_index:self.devset_index+1] self.devsetbox.tag_remove('true-pos', '1.0', 'end') self.devsetbox.tag_remove('false-neg', '1.0', 'end') self.devsetbox.tag_remove('false-pos', '1.0', 'end') # Run the grammar on the test cases. for sentnum, gold_tree in enumerate(sample): # Run the chunk parser test_tree = self._chunkparse(gold_tree.leaves()) # Extract gold & test chunks gold_chunks = self._chunks(gold_tree) test_chunks = self._chunks(test_tree) # Compare them. for chunk in gold_chunks.intersection(test_chunks): self._color_chunk(sentnum, chunk, 'true-pos') for chunk in gold_chunks - test_chunks: self._color_chunk(sentnum, chunk, 'false-neg') for chunk in test_chunks - gold_chunks: self._color_chunk(sentnum, chunk, 'false-pos') def _chunkparse(self, words): try: return self.chunker.parse(words) except (ValueError, IndexError) as e: # There's an error somewhere in the grammar, but we're not sure # exactly where, so just mark the whole grammar as bad. # E.g., this is caused by: "({})" self.grammarbox.tag_add('error', '1.0', 'end') # Treat it as tagging nothing: return words def _color_chunk(self, sentnum, chunk, tag): start, end = chunk self.devsetbox.tag_add(tag, '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, start]), '%s.%s' % (self.linenum[sentnum], self.charnum[sentnum, end]-1)) def reset(self): # Clear various variables self.chunker = None self.grammar = None self.normalized_grammar = None self.grammar_changed = 0 self._history = [] self._history_index = 0 # Update the on-screen display. 
self.grammarbox.delete('1.0', 'end') self.show_devset(0) self.update() #self._eval_plot() SAVE_GRAMMAR_TEMPLATE = ( '# Regexp Chunk Parsing Grammar\n' '# Saved %(date)s\n' '#\n' '# Development set: %(devset)s\n' '# Precision: %(precision)s\n' '# Recall: %(recall)s\n' '# F-score: %(fscore)s\n\n' '%(grammar)s\n') def save_grammar(self, filename=None): if not filename: ftypes = [('Chunk Gramamr', '.chunk'), ('All files', '*')] filename = tkinter.filedialog.asksaveasfilename(filetypes=ftypes, defaultextension='.chunk') if not filename: return if (self._history and self.normalized_grammar == self.normalize_grammar(self._history[-1][0])): precision, recall, fscore = ['%.2f%%' % (100*v) for v in self._history[-1][1:]] elif self.chunker is None: precision = recall = fscore = 'Grammar not well formed' else: precision = recall = fscore = 'Not finished evaluation yet' with open(filename, 'w') as outfile: outfile.write(self.SAVE_GRAMMAR_TEMPLATE % dict( date=time.ctime(), devset=self.devset_name, precision=precision, recall=recall, fscore=fscore, grammar=self.grammar.strip())) def load_grammar(self, filename=None): if not filename: ftypes = [('Chunk Gramamr', '.chunk'), ('All files', '*')] filename = tkinter.filedialog.askopenfilename(filetypes=ftypes, defaultextension='.chunk') if not filename: return self.grammarbox.delete('1.0', 'end') self.update() with open(filename, 'r') as infile: grammar = infile.read() grammar = re.sub('^\# Regexp Chunk Parsing Grammar[\s\S]*' 'F-score:.*\n', '', grammar).lstrip() self.grammarbox.insert('1.0', grammar) self.update() def save_history(self, filename=None): if not filename: ftypes = [('Chunk Gramamr History', '.txt'), ('All files', '*')] filename = tkinter.filedialog.asksaveasfilename(filetypes=ftypes, defaultextension='.txt') if not filename: return with open(filename, 'w') as outfile: outfile.write('# Regexp Chunk Parsing Grammar History\n') outfile.write('# Saved %s\n' % time.ctime()) outfile.write('# Development set: %s\n' % self.devset_name) for i, (g, p, r, f) in enumerate(self._history): hdr = ('Grammar %d/%d (precision=%.2f%%, recall=%.2f%%, ' 'fscore=%.2f%%)' % (i+1, len(self._history), p*100, r*100, f*100)) outfile.write('\n%s\n' % hdr) outfile.write(''.join(' %s\n' % line for line in g.strip().split())) if not (self._history and self.normalized_grammar == self.normalize_grammar(self._history[-1][0])): if self.chunker is None: outfile.write('\nCurrent Grammar (not well-formed)\n') else: outfile.write('\nCurrent Grammar (not evaluated)\n') outfile.write(''.join(' %s\n' % line for line in self.grammar.strip().split())) def about(self, *e): ABOUT = ("NLTK RegExp Chunk Parser Application\n"+ "Written by Edward Loper") TITLE = 'About: Regular Expression Chunk Parser Application' try: from tkinter.messagebox import Message Message(message=ABOUT, title=TITLE).show() except: ShowText(self.top, TITLE, ABOUT) def set_devset_size(self, size=None): if size is not None: self._devset_size.set(size) self._devset_size.set(min(len(self.devset), self._devset_size.get())) self.show_devset(1) self.show_devset(0) # what about history? Evaluated at diff dev set sizes! def resize(self, size=None): if size is not None: self._size.set(size) size = self._size.get() self._font.configure(size=-(abs(size))) self._smallfont.configure(size=min(-10, -(abs(size))*14/20)) def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this demo is created from a non-interactive program (e.g. 
from a secript); otherwise, the demo will close as soon as the script completes. """ if in_idle(): return self.top.mainloop(*args, **kwargs) def app(): RegexpChunkApp().mainloop() if __name__ == '__main__': app() __all__ = ['app'] nltk-3.1/nltk/app/collocations_app.py0000644000076500000240000003261412607224144017517 0ustar sbstaff00000000000000# Natural Language Toolkit: Collocations Application # Much of the GUI code is imported from concordance.py; We intend to merge these tools together # Copyright (C) 2001-2015 NLTK Project # Author: Sumukh Ghodke # URL: # For license information, see LICENSE.TXT # import nltk.compat import threading import tkinter.font if nltk.compat.PY3: import queue as q else: import Queue as q from tkinter import (Button, END, Frame, IntVar, LEFT, Label, Menu, OptionMenu, SUNKEN, Scrollbar, StringVar, Text, Tk) from nltk.corpus import (cess_cat, brown, nps_chat, treebank, sinica_treebank, alpino, indian, floresta, mac_morpho, machado, cess_esp) from nltk.util import in_idle from nltk.probability import FreqDist CORPUS_LOADED_EVENT = '<>' ERROR_LOADING_CORPUS_EVENT = '<>' POLL_INTERVAL = 100 _DEFAULT = 'English: Brown Corpus (Humor)' _CORPORA = { 'Catalan: CESS-CAT Corpus': lambda: cess_cat.words(), 'English: Brown Corpus': lambda: brown.words(), 'English: Brown Corpus (Press)': lambda: brown.words(categories=['news', 'editorial', 'reviews']), 'English: Brown Corpus (Religion)': lambda: brown.words(categories='religion'), 'English: Brown Corpus (Learned)': lambda: brown.words(categories='learned'), 'English: Brown Corpus (Science Fiction)': lambda: brown.words(categories='science_fiction'), 'English: Brown Corpus (Romance)': lambda: brown.words(categories='romance'), 'English: Brown Corpus (Humor)': lambda: brown.words(categories='humor'), 'English: NPS Chat Corpus': lambda: nps_chat.words(), 'English: Wall Street Journal Corpus': lambda: treebank.words(), 'Chinese: Sinica Corpus': lambda: sinica_treebank.words(), 'Dutch: Alpino Corpus': lambda: alpino.words(), 'Hindi: Indian Languages Corpus': lambda: indian.words(files='hindi.pos'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.words(), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.words(), 'Portuguese: Machado Corpus (Brazil)': lambda: machado.words(), 'Spanish: CESS-ESP Corpus': lambda: cess_esp.words() } class CollocationsView: _BACKGROUND_COLOUR='#FFF' #white def __init__(self): self.queue = q.Queue() self.model = CollocationsModel(self.queue) self.top = Tk() self._init_top(self.top) self._init_menubar() self._init_widgets(self.top) self.load_corpus(self.model.DEFAULT_CORPUS) self.after = self.top.after(POLL_INTERVAL, self._poll) def _init_top(self, top): top.geometry('550x650+50+50') top.title('NLTK Collocations List') top.bind('', self.destroy) top.protocol('WM_DELETE_WINDOW', self.destroy) top.minsize(550,650) def _init_widgets(self, parent): self.main_frame = Frame(parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)) self._init_corpus_select(self.main_frame) self._init_results_box(self.main_frame) self._init_paging(self.main_frame) self._init_status(self.main_frame) self.main_frame.pack(fill='both', expand=True) def _init_corpus_select(self, parent): innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) self.var = StringVar(innerframe) self.var.set(self.model.DEFAULT_CORPUS) Label(innerframe, justify=LEFT, text=' Corpus: ', background=self._BACKGROUND_COLOUR, padx = 2, pady = 1, border = 0).pack(side='left') other_corpora = 
list(self.model.CORPORA.keys()).remove(self.model.DEFAULT_CORPUS) om = OptionMenu(innerframe, self.var, self.model.DEFAULT_CORPUS, command=self.corpus_selected, *self.model.non_default_corpora()) om['borderwidth'] = 0 om['highlightthickness'] = 1 om.pack(side='left') innerframe.pack(side='top', fill='x', anchor='n') def _init_status(self, parent): self.status = Label(parent, justify=LEFT, relief=SUNKEN, background=self._BACKGROUND_COLOUR, border=0, padx = 1, pady = 0) self.status.pack(side='top', anchor='sw') def _init_menubar(self): self._result_size = IntVar(self.top) menubar = Menu(self.top) filemenu = Menu(menubar, tearoff=0, borderwidth=0) filemenu.add_command(label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-q') menubar.add_cascade(label='File', underline=0, menu=filemenu) editmenu = Menu(menubar, tearoff=0) rescntmenu = Menu(editmenu, tearoff=0) rescntmenu.add_radiobutton(label='20', variable=self._result_size, underline=0, value=20, command=self.set_result_size) rescntmenu.add_radiobutton(label='50', variable=self._result_size, underline=0, value=50, command=self.set_result_size) rescntmenu.add_radiobutton(label='100', variable=self._result_size, underline=0, value=100, command=self.set_result_size) rescntmenu.invoke(1) editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu) menubar.add_cascade(label='Edit', underline=0, menu=editmenu) self.top.config(menu=menubar) def set_result_size(self, **kwargs): self.model.result_count = self._result_size.get() def _init_results_box(self, parent): innerframe = Frame(parent) i1 = Frame(innerframe) i2 = Frame(innerframe) vscrollbar = Scrollbar(i1, borderwidth=1) hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz') self.results_box = Text(i1, font=tkinter.font.Font(family='courier', size='16'), state='disabled', borderwidth=1, yscrollcommand=vscrollbar.set, xscrollcommand=hscrollbar.set, wrap='none', width='40', height = '20', exportselection=1) self.results_box.pack(side='left', fill='both', expand=True) vscrollbar.pack(side='left', fill='y', anchor='e') vscrollbar.config(command=self.results_box.yview) hscrollbar.pack(side='left', fill='x', expand=True, anchor='w') hscrollbar.config(command=self.results_box.xview) #there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!! 
Label(i2, text=' ', background=self._BACKGROUND_COLOUR).pack(side='left', anchor='e') i1.pack(side='top', fill='both', expand=True, anchor='n') i2.pack(side='bottom', fill='x', anchor='s') innerframe.pack(side='top', fill='both', expand=True) def _init_paging(self, parent): innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) self.prev = prev = Button(innerframe, text='Previous', command=self.previous, width='10', borderwidth=1, highlightthickness=1, state='disabled') prev.pack(side='left', anchor='center') self.next = next = Button(innerframe, text='Next', command=self.__next__, width='10', borderwidth=1, highlightthickness=1, state='disabled') next.pack(side='right', anchor='center') innerframe.pack(side='top', fill='y') self.reset_current_page() def reset_current_page(self): self.current_page = -1 def _poll(self): try: event = self.queue.get(block=False) except q.Empty: pass else: if event == CORPUS_LOADED_EVENT: self.handle_corpus_loaded(event) elif event == ERROR_LOADING_CORPUS_EVENT: self.handle_error_loading_corpus(event) self.after = self.top.after(POLL_INTERVAL, self._poll) def handle_error_loading_corpus(self, event): self.status['text'] = 'Error in loading ' + self.var.get() self.unfreeze_editable() self.clear_results_box() self.freeze_editable() self.reset_current_page() def handle_corpus_loaded(self, event): self.status['text'] = self.var.get() + ' is loaded' self.unfreeze_editable() self.clear_results_box() self.reset_current_page() #self.next() collocations = self.model.next(self.current_page + 1) self.write_results(collocations) self.current_page += 1 def corpus_selected(self, *args): new_selection = self.var.get() self.load_corpus(new_selection) def previous(self): self.freeze_editable() collocations = self.model.prev(self.current_page - 1) self.current_page= self.current_page - 1 self.clear_results_box() self.write_results(collocations) self.unfreeze_editable() def __next__(self): self.freeze_editable() collocations = self.model.next(self.current_page + 1) self.clear_results_box() self.write_results(collocations) self.current_page += 1 self.unfreeze_editable() def load_corpus(self, selection): if self.model.selected_corpus != selection: self.status['text'] = 'Loading ' + selection + '...' 
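# ------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the original module):
# the corpus loader above never touches Tk widgets from the worker thread;
# the LoadCorpus thread only puts an event token on self.queue, and _poll()
# (rescheduled via top.after(POLL_INTERVAL, ...)) drains that queue on the
# GUI thread.  This is a minimal, GUI-free rendering of that hand-off,
# written for Python 3; the names _DONE, _slow_load and _poll_once are
# invented for the example.
import threading
import time
import queue as _sketch_queue

_DONE = 'CORPUS_LOADED'

def _slow_load(out_queue):
    # Stand-in for LoadCorpus.run(): do the slow work off the GUI thread,
    # then signal completion through the queue only.
    time.sleep(0.1)
    out_queue.put(_DONE)

def _poll_once(in_queue):
    # Stand-in for _poll(): a non-blocking get, so the polling side is
    # never stalled while the worker is still running.
    try:
        return in_queue.get(block=False)
    except _sketch_queue.Empty:
        return None

events = _sketch_queue.Queue()
threading.Thread(target=_slow_load, args=(events,)).start()
while _poll_once(events) != _DONE:
    time.sleep(0.05)   # a GUI would instead reschedule top.after(POLL_INTERVAL, ...)
# ------------------------------------------------------------------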
self.freeze_editable() self.model.load_corpus(selection) def freeze_editable(self): self.prev['state'] = 'disabled' self.next['state'] = 'disabled' def clear_results_box(self): self.results_box['state'] = 'normal' self.results_box.delete("1.0", END) self.results_box['state'] = 'disabled' def fire_event(self, event): #Firing an event so that rendering of widgets happen in the mainloop thread self.top.event_generate(event, when='tail') def destroy(self, *e): if self.top is None: return self.top.after_cancel(self.after) self.top.destroy() self.top = None def mainloop(self, *args, **kwargs): if in_idle(): return self.top.mainloop(*args, **kwargs) def unfreeze_editable(self): self.set_paging_button_states() def set_paging_button_states(self): if self.current_page == -1 or self.current_page == 0: self.prev['state'] = 'disabled' else: self.prev['state'] = 'normal' if self.model.is_last_page(self.current_page): self.next['state'] = 'disabled' else: self.next['state'] = 'normal' def write_results(self, results): self.results_box['state'] = 'normal' row = 1 for each in results: self.results_box.insert(str(row) + '.0', each[0] + " " + each[1] + "\n") row += 1 self.results_box['state'] = 'disabled' class CollocationsModel: def __init__(self, queue): self.result_count = None self.selected_corpus = None self.collocations = None self.CORPORA = _CORPORA self.DEFAULT_CORPUS = _DEFAULT self.queue = queue self.reset_results() def reset_results(self): self.result_pages = [] self.results_returned = 0 def load_corpus(self, name): self.selected_corpus = name self.collocations = None runner_thread = self.LoadCorpus(name, self) runner_thread.start() self.reset_results() def non_default_corpora(self): copy = [] copy.extend(list(self.CORPORA.keys())) copy.remove(self.DEFAULT_CORPUS) copy.sort() return copy def is_last_page(self, number): if number < len(self.result_pages): return False return self.results_returned + (number - len(self.result_pages)) * self.result_count >= len(self.collocations) def next(self, page): if (len(self.result_pages) - 1) < page: for i in range(page - (len(self.result_pages) - 1)): self.result_pages.append(self.collocations[self.results_returned:self.results_returned+self.result_count]) self.results_returned += self.result_count return self.result_pages[page] def prev(self, page): if page == -1: return [] return self.result_pages[page] class LoadCorpus(threading.Thread): def __init__(self, name, model): threading.Thread.__init__(self) self.model, self.name = model, name def run(self): try: words = self.model.CORPORA[self.name]() from operator import itemgetter text = [w for w in words if len(w) > 2] fd = FreqDist(tuple(text[i:i+2]) for i in range(len(text)-1)) vocab = FreqDist(text) scored = [((w1,w2), fd[(w1,w2)] ** 3 / float(vocab[w1] * vocab[w2])) for w1, w2 in fd] scored.sort(key=itemgetter(1), reverse=True) self.model.collocations = list(map(itemgetter(0), scored)) self.model.queue.put(CORPUS_LOADED_EVENT) except Exception as e: print(e) self.model.queue.put(ERROR_LOADING_CORPUS_EVENT) #def collocations(): # colloc_strings = [w1 + ' ' + w2 for w1, w2 in self._collocations[:num]] def app(): c = CollocationsView() c.mainloop() if __name__ == '__main__': app() __all__ = ['app'] nltk-3.1/nltk/app/concordance_app.py0000755000076500000240000005640412607224144017312 0ustar sbstaff00000000000000# Natural Language Toolkit: Concordance Application # # Copyright (C) 2001-2015 NLTK Project # Author: Sumukh Ghodke # URL: # For license information, see LICENSE.TXT import nltk.compat import re import 
threading if nltk.compat.PY3: import queue as q else: import Queue as q import tkinter.font from tkinter import (Tk, Button, END, Entry, Frame, IntVar, LEFT, Label, Menu, OptionMenu, SUNKEN, Scrollbar, StringVar, Text) from nltk.corpus import (cess_cat, brown, nps_chat, treebank, sinica_treebank, alpino, indian, floresta, mac_morpho, cess_esp) from nltk.util import in_idle from nltk.draw.util import ShowText WORD_OR_TAG = '[^/ ]+' BOUNDARY = r'\b' CORPUS_LOADED_EVENT = '<>' SEARCH_TERMINATED_EVENT = '<>' SEARCH_ERROR_EVENT = '<>' ERROR_LOADING_CORPUS_EVENT = '<>' POLL_INTERVAL = 50 # NB All corpora must be specified in a lambda expression so as not to be # loaded when the module is imported. _DEFAULT = 'English: Brown Corpus (Humor, simplified)' _CORPORA = { 'Catalan: CESS-CAT Corpus (simplified)': lambda: cess_cat.tagged_sents(tagset='universal'), 'English: Brown Corpus': lambda: brown.tagged_sents(), 'English: Brown Corpus (simplified)': lambda: brown.tagged_sents(tagset='universal'), 'English: Brown Corpus (Press, simplified)': lambda: brown.tagged_sents(categories=['news', 'editorial', 'reviews'], tagset='universal'), 'English: Brown Corpus (Religion, simplified)': lambda: brown.tagged_sents(categories='religion', tagset='universal'), 'English: Brown Corpus (Learned, simplified)': lambda: brown.tagged_sents(categories='learned', tagset='universal'), 'English: Brown Corpus (Science Fiction, simplified)': lambda: brown.tagged_sents(categories='science_fiction', tagset='universal'), 'English: Brown Corpus (Romance, simplified)': lambda: brown.tagged_sents(categories='romance', tagset='universal'), 'English: Brown Corpus (Humor, simplified)': lambda: brown.tagged_sents(categories='humor', tagset='universal'), 'English: NPS Chat Corpus': lambda: nps_chat.tagged_posts(), 'English: NPS Chat Corpus (simplified)': lambda: nps_chat.tagged_posts(tagset='universal'), 'English: Wall Street Journal Corpus': lambda: treebank.tagged_sents(), 'English: Wall Street Journal Corpus (simplified)': lambda: treebank.tagged_sents(tagset='universal'), 'Chinese: Sinica Corpus': lambda: sinica_treebank.tagged_sents(), 'Chinese: Sinica Corpus (simplified)': lambda: sinica_treebank.tagged_sents(tagset='universal'), 'Dutch: Alpino Corpus': lambda: alpino.tagged_sents(), 'Dutch: Alpino Corpus (simplified)': lambda: alpino.tagged_sents(tagset='universal'), 'Hindi: Indian Languages Corpus': lambda: indian.tagged_sents(files='hindi.pos'), 'Hindi: Indian Languages Corpus (simplified)': lambda: indian.tagged_sents(files='hindi.pos', tagset='universal'), 'Portuguese: Floresta Corpus (Portugal)': lambda: floresta.tagged_sents(), 'Portuguese: Floresta Corpus (Portugal, simplified)': lambda: floresta.tagged_sents(tagset='universal'), 'Portuguese: MAC-MORPHO Corpus (Brazil)': lambda: mac_morpho.tagged_sents(), 'Portuguese: MAC-MORPHO Corpus (Brazil, simplified)': lambda: mac_morpho.tagged_sents(tagset='universal'), 'Spanish: CESS-ESP Corpus (simplified)': lambda: cess_esp.tagged_sents(tagset='universal'), } class ConcordanceSearchView(object): _BACKGROUND_COLOUR='#FFF' #white #Colour of highlighted results _HIGHLIGHT_WORD_COLOUR='#F00' #red _HIGHLIGHT_WORD_TAG='HL_WRD_TAG' _HIGHLIGHT_LABEL_COLOUR='#C0C0C0' # dark grey _HIGHLIGHT_LABEL_TAG='HL_LBL_TAG' #Percentage of text left of the scrollbar position _FRACTION_LEFT_TEXT=0.30 def __init__(self): self.queue = q.Queue() self.model = ConcordanceSearchModel(self.queue) self.top = Tk() self._init_top(self.top) self._init_menubar() self._init_widgets(self.top) 
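# ------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the original module):
# the scoring used in collocations_app.py's LoadCorpus.run() above ranks
# adjacent word pairs by freq(w1, w2)**3 / (freq(w1) * freq(w2)).  Below
# is a self-contained rendering of that ranking on a tiny invented word
# list; only FreqDist is assumed, exactly as imported in that module.
from operator import itemgetter
from nltk.probability import FreqDist

toy_words = ['strong', 'tea', 'strong', 'tea', 'strong', 'coffee', 'weak', 'tea']
text = [w for w in toy_words if len(w) > 2]          # same length filter as above
fd = FreqDist(tuple(text[i:i + 2]) for i in range(len(text) - 1))
vocab = FreqDist(text)
scored = [((w1, w2), fd[(w1, w2)] ** 3 / float(vocab[w1] * vocab[w2]))
          for (w1, w2) in fd]
scored.sort(key=itemgetter(1), reverse=True)
# ('strong', 'tea') outranks the pairs that occur only once.
# ------------------------------------------------------------------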
self.load_corpus(self.model.DEFAULT_CORPUS) self.after = self.top.after(POLL_INTERVAL, self._poll) def _init_top(self, top): top.geometry('950x680+50+50') top.title('NLTK Concordance Search') top.bind('', self.destroy) top.protocol('WM_DELETE_WINDOW', self.destroy) top.minsize(950,680) def _init_widgets(self, parent): self.main_frame = Frame(parent, dict(background=self._BACKGROUND_COLOUR, padx=1, pady=1, border=1)) self._init_corpus_select(self.main_frame) self._init_query_box(self.main_frame) self._init_results_box(self.main_frame) self._init_paging(self.main_frame) self._init_status(self.main_frame) self.main_frame.pack(fill='both', expand=True) def _init_menubar(self): self._result_size = IntVar(self.top) self._cntx_bf_len = IntVar(self.top) self._cntx_af_len = IntVar(self.top) menubar = Menu(self.top) filemenu = Menu(menubar, tearoff=0, borderwidth=0) filemenu.add_command(label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-q') menubar.add_cascade(label='File', underline=0, menu=filemenu) editmenu = Menu(menubar, tearoff=0) rescntmenu = Menu(editmenu, tearoff=0) rescntmenu.add_radiobutton(label='20', variable=self._result_size, underline=0, value=20, command=self.set_result_size) rescntmenu.add_radiobutton(label='50', variable=self._result_size, underline=0, value=50, command=self.set_result_size) rescntmenu.add_radiobutton(label='100', variable=self._result_size, underline=0, value=100, command=self.set_result_size) rescntmenu.invoke(1) editmenu.add_cascade(label='Result Count', underline=0, menu=rescntmenu) cntxmenu = Menu(editmenu, tearoff=0) cntxbfmenu = Menu(cntxmenu, tearoff=0) cntxbfmenu.add_radiobutton(label='60 characters', variable=self._cntx_bf_len, underline=0, value=60, command=self.set_cntx_bf_len) cntxbfmenu.add_radiobutton(label='80 characters', variable=self._cntx_bf_len, underline=0, value=80, command=self.set_cntx_bf_len) cntxbfmenu.add_radiobutton(label='100 characters', variable=self._cntx_bf_len, underline=0, value=100, command=self.set_cntx_bf_len) cntxbfmenu.invoke(1) cntxmenu.add_cascade(label='Before', underline=0, menu=cntxbfmenu) cntxafmenu = Menu(cntxmenu, tearoff=0) cntxafmenu.add_radiobutton(label='70 characters', variable=self._cntx_af_len, underline=0, value=70, command=self.set_cntx_af_len) cntxafmenu.add_radiobutton(label='90 characters', variable=self._cntx_af_len, underline=0, value=90, command=self.set_cntx_af_len) cntxafmenu.add_radiobutton(label='110 characters', variable=self._cntx_af_len, underline=0, value=110, command=self.set_cntx_af_len) cntxafmenu.invoke(1) cntxmenu.add_cascade(label='After', underline=0, menu=cntxafmenu) editmenu.add_cascade(label='Context', underline=0, menu=cntxmenu) menubar.add_cascade(label='Edit', underline=0, menu=editmenu) self.top.config(menu=menubar) def set_result_size(self, **kwargs): self.model.result_count = self._result_size.get() def set_cntx_af_len(self, **kwargs): self._char_after = self._cntx_af_len.get() def set_cntx_bf_len(self, **kwargs): self._char_before = self._cntx_bf_len.get() def _init_corpus_select(self, parent): innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) self.var = StringVar(innerframe) self.var.set(self.model.DEFAULT_CORPUS) Label(innerframe, justify=LEFT, text=' Corpus: ', background=self._BACKGROUND_COLOUR, padx = 2, pady = 1, border = 0).pack(side='left') other_corpora = list(self.model.CORPORA.keys()).remove(self.model.DEFAULT_CORPUS) om = OptionMenu(innerframe, self.var, self.model.DEFAULT_CORPUS, command=self.corpus_selected, 
*self.model.non_default_corpora()) om['borderwidth'] = 0 om['highlightthickness'] = 1 om.pack(side='left') innerframe.pack(side='top', fill='x', anchor='n') def _init_status(self, parent): self.status = Label(parent, justify=LEFT, relief=SUNKEN, background=self._BACKGROUND_COLOUR, border=0, padx = 1, pady = 0) self.status.pack(side='top', anchor='sw') def _init_query_box(self, parent): innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) another = Frame(innerframe, background=self._BACKGROUND_COLOUR) self.query_box = Entry(another, width=60) self.query_box.pack(side='left', fill='x', pady=25, anchor='center') self.search_button = Button(another, text='Search', command=self.search, borderwidth=1, highlightthickness=1) self.search_button.pack(side='left', fill='x', pady=25, anchor='center') self.query_box.bind('', self.search_enter_keypress_handler) another.pack() innerframe.pack(side='top', fill='x', anchor='n') def search_enter_keypress_handler(self, *event): self.search() def _init_results_box(self, parent): innerframe = Frame(parent) i1 = Frame(innerframe) i2 = Frame(innerframe) vscrollbar = Scrollbar(i1, borderwidth=1) hscrollbar = Scrollbar(i2, borderwidth=1, orient='horiz') self.results_box = Text(i1, font=tkinter.font.Font(family='courier', size='16'), state='disabled', borderwidth=1, yscrollcommand=vscrollbar.set, xscrollcommand=hscrollbar.set, wrap='none', width='40', height = '20', exportselection=1) self.results_box.pack(side='left', fill='both', expand=True) self.results_box.tag_config(self._HIGHLIGHT_WORD_TAG, foreground=self._HIGHLIGHT_WORD_COLOUR) self.results_box.tag_config(self._HIGHLIGHT_LABEL_TAG, foreground=self._HIGHLIGHT_LABEL_COLOUR) vscrollbar.pack(side='left', fill='y', anchor='e') vscrollbar.config(command=self.results_box.yview) hscrollbar.pack(side='left', fill='x', expand=True, anchor='w') hscrollbar.config(command=self.results_box.xview) #there is no other way of avoiding the overlap of scrollbars while using pack layout manager!!! 
Label(i2, text=' ', background=self._BACKGROUND_COLOUR).pack(side='left', anchor='e') i1.pack(side='top', fill='both', expand=True, anchor='n') i2.pack(side='bottom', fill='x', anchor='s') innerframe.pack(side='top', fill='both', expand=True) def _init_paging(self, parent): innerframe = Frame(parent, background=self._BACKGROUND_COLOUR) self.prev = prev = Button(innerframe, text='Previous', command=self.previous, width='10', borderwidth=1, highlightthickness=1, state='disabled') prev.pack(side='left', anchor='center') self.next = next = Button(innerframe, text='Next', command=self.__next__, width='10', borderwidth=1, highlightthickness=1, state='disabled') next.pack(side='right', anchor='center') innerframe.pack(side='top', fill='y') self.current_page = 0 def previous(self): self.clear_results_box() self.freeze_editable() self.model.prev(self.current_page - 1) def __next__(self): self.clear_results_box() self.freeze_editable() self.model.next(self.current_page + 1) def about(self, *e): ABOUT = ("NLTK Concordance Search Demo\n") TITLE = 'About: NLTK Concordance Search Demo' try: from tkinter.messagebox import Message Message(message=ABOUT, title=TITLE, parent=self.main_frame).show() except: ShowText(self.top, TITLE, ABOUT) def _bind_event_handlers(self): self.top.bind(CORPUS_LOADED_EVENT, self.handle_corpus_loaded) self.top.bind(SEARCH_TERMINATED_EVENT, self.handle_search_terminated) self.top.bind(SEARCH_ERROR_EVENT, self.handle_search_error) self.top.bind(ERROR_LOADING_CORPUS_EVENT, self.handle_error_loading_corpus) def _poll(self): try: event = self.queue.get(block=False) except q.Empty: pass else: if event == CORPUS_LOADED_EVENT: self.handle_corpus_loaded(event) elif event == SEARCH_TERMINATED_EVENT: self.handle_search_terminated(event) elif event == SEARCH_ERROR_EVENT: self.handle_search_error(event) elif event == ERROR_LOADING_CORPUS_EVENT: self.handle_error_loading_corpus(event) self.after = self.top.after(POLL_INTERVAL, self._poll) def handle_error_loading_corpus(self, event): self.status['text'] = 'Error in loading ' + self.var.get() self.unfreeze_editable() self.clear_all() self.freeze_editable() def handle_corpus_loaded(self, event): self.status['text'] = self.var.get() + ' is loaded' self.unfreeze_editable() self.clear_all() self.query_box.focus_set() def handle_search_terminated(self, event): #todo: refactor the model such that it is less state sensitive results = self.model.get_results() self.write_results(results) self.status['text'] = '' if len(results) == 0: self.status['text'] = 'No results found for ' + self.model.query else: self.current_page = self.model.last_requested_page self.unfreeze_editable() self.results_box.xview_moveto(self._FRACTION_LEFT_TEXT) def handle_search_error(self, event): self.status['text'] = 'Error in query ' + self.model.query self.unfreeze_editable() def corpus_selected(self, *args): new_selection = self.var.get() self.load_corpus(new_selection) def load_corpus(self, selection): if self.model.selected_corpus != selection: self.status['text'] = 'Loading ' + selection + '...' 
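# ------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the original module):
# words_and_labels() above walks a matched slice of a "word/tag word/tag"
# sentence and records character spans for the words and for the tags so
# the two can be highlighted in different colours.  This is a simplified,
# offset-0 rendering of that bookkeeping (no _char_before padding); the
# helper name and the sample sentence are invented.
def _sketch_spans(tagged_text):
    """Return ([(word_start, word_end), ...], [(tag_start, tag_end), ...])."""
    words, tags = [], []
    index = 0
    for token in tagged_text.split(' '):
        if token:
            word, tag = token.split('/')
            words.append((index, index + len(word)))
            index += len(word) + 1                 # skip the '/'
            tags.append((index, index + len(tag)))
            index += len(tag)
        index += 1                                 # skip the separating space
    return words, tags

# _sketch_spans('The/DT dog/NN') -> ([(0, 3), (7, 10)], [(4, 6), (11, 13)])
# ------------------------------------------------------------------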
self.freeze_editable() self.model.load_corpus(selection) def search(self): self.current_page = 0 self.clear_results_box() self.model.reset_results() query = self.query_box.get() if (len(query.strip()) == 0): return self.status['text'] = 'Searching for ' + query self.freeze_editable() self.model.search(query, self.current_page + 1, ) def write_results(self, results): self.results_box['state'] = 'normal' row = 1 for each in results: sent, pos1, pos2 = each[0].strip(), each[1], each[2] if len(sent) != 0: if (pos1 < self._char_before): sent, pos1, pos2 = self.pad(sent, pos1, pos2) sentence = sent[pos1-self._char_before:pos1+self._char_after] if not row == len(results): sentence += '\n' self.results_box.insert(str(row) + '.0', sentence) word_markers, label_markers = self.words_and_labels(sent, pos1, pos2) for marker in word_markers: self.results_box.tag_add(self._HIGHLIGHT_WORD_TAG, str(row) + '.' + str(marker[0]), str(row) + '.' + str(marker[1])) for marker in label_markers: self.results_box.tag_add(self._HIGHLIGHT_LABEL_TAG, str(row) + '.' + str(marker[0]), str(row) + '.' + str(marker[1])) row += 1 self.results_box['state'] = 'disabled' def words_and_labels(self, sentence, pos1, pos2): search_exp = sentence[pos1:pos2] words, labels = [], [] labeled_words = search_exp.split(' ') index = 0 for each in labeled_words: if each == '': index += 1 else: word, label = each.split('/') words.append((self._char_before + index, self._char_before + index + len(word))) index += len(word) + 1 labels.append((self._char_before + index, self._char_before + index + len(label))) index += len(label) index += 1 return words, labels def pad(self, sent, hstart, hend): if hstart >= self._char_before: return sent, hstart, hend d = self._char_before - hstart sent = ''.join([' '] * d) + sent return sent, hstart + d, hend + d def destroy(self, *e): if self.top is None: return self.top.after_cancel(self.after) self.top.destroy() self.top = None def clear_all(self): self.query_box.delete(0, END) self.model.reset_query() self.clear_results_box() def clear_results_box(self): self.results_box['state'] = 'normal' self.results_box.delete("1.0", END) self.results_box['state'] = 'disabled' def freeze_editable(self): self.query_box['state'] = 'disabled' self.search_button['state'] = 'disabled' self.prev['state'] = 'disabled' self.next['state'] = 'disabled' def unfreeze_editable(self): self.query_box['state'] = 'normal' self.search_button['state'] = 'normal' self.set_paging_button_states() def set_paging_button_states(self): if self.current_page == 0 or self.current_page == 1: self.prev['state'] = 'disabled' else: self.prev['state'] = 'normal' if self.model.has_more_pages(self.current_page): self.next['state'] = 'normal' else: self.next['state'] = 'disabled' def fire_event(self, event): #Firing an event so that rendering of widgets happen in the mainloop thread self.top.event_generate(event, when='tail') def mainloop(self, *args, **kwargs): if in_idle(): return self.top.mainloop(*args, **kwargs) class ConcordanceSearchModel(object): def __init__(self, queue): self.queue = queue self.CORPORA = _CORPORA self.DEFAULT_CORPUS = _DEFAULT self.selected_corpus = None self.reset_query() self.reset_results() self.result_count = None self.last_sent_searched = 0 def non_default_corpora(self): copy = [] copy.extend(list(self.CORPORA.keys())) copy.remove(self.DEFAULT_CORPUS) copy.sort() return copy def load_corpus(self, name): self.selected_corpus = name self.tagged_sents = [] runner_thread = self.LoadCorpus(name, self) runner_thread.start() def 
search(self, query, page): self.query = query self.last_requested_page = page self.SearchCorpus(self, page, self.result_count).start() def next(self, page): self.last_requested_page = page if len(self.results) < page: self.search(self.query, page) else: self.queue.put(SEARCH_TERMINATED_EVENT) def prev(self, page): self.last_requested_page = page self.queue.put(SEARCH_TERMINATED_EVENT) def reset_results(self): self.last_sent_searched = 0 self.results = [] self.last_page = None def reset_query(self): self.query = None def set_results(self, page, resultset): self.results.insert(page - 1, resultset) def get_results(self): return self.results[self.last_requested_page - 1] def has_more_pages(self, page): if self.results == [] or self.results[0] == []: return False if self.last_page is None: return True return page < self.last_page class LoadCorpus(threading.Thread): def __init__(self, name, model): threading.Thread.__init__(self) self.model, self.name = model, name def run(self): try: ts = self.model.CORPORA[self.name]() self.model.tagged_sents = [' '.join(w+'/'+t for (w,t) in sent) for sent in ts] self.model.queue.put(CORPUS_LOADED_EVENT) except Exception as e: print(e) self.model.queue.put(ERROR_LOADING_CORPUS_EVENT) class SearchCorpus(threading.Thread): def __init__(self, model, page, count): self.model, self.count, self.page = model, count, page threading.Thread.__init__(self) def run(self): q = self.processed_query() sent_pos, i, sent_count = [], 0, 0 for sent in self.model.tagged_sents[self.model.last_sent_searched:]: try: m = re.search(q, sent) except re.error: self.model.reset_results() self.model.queue.put(SEARCH_ERROR_EVENT) return if m: sent_pos.append((sent, m.start(), m.end())) i += 1 if i > self.count: self.model.last_sent_searched += sent_count - 1 break sent_count += 1 if (self.count >= len(sent_pos)): self.model.last_sent_searched += sent_count - 1 self.model.last_page = self.page self.model.set_results(self.page, sent_pos) else: self.model.set_results(self.page, sent_pos[:-1]) self.model.queue.put(SEARCH_TERMINATED_EVENT) def processed_query(self): new = [] for term in self.model.query.split(): term = re.sub(r'\.', r'[^/ ]', term) if re.match('[A-Z]+$', term): new.append(BOUNDARY + WORD_OR_TAG + '/' + term + BOUNDARY) elif '/' in term: new.append(BOUNDARY + term + BOUNDARY) else: new.append(BOUNDARY + term + '/' + WORD_OR_TAG + BOUNDARY) return ' '.join(new) def app(): d = ConcordanceSearchView() d.mainloop() if __name__ == '__main__': app() __all__ = ['app'] nltk-3.1/nltk/app/nemo_app.py0000755000076500000240000002743312574600335015775 0ustar sbstaff00000000000000# Finding (and Replacing) Nemo, Version 1.1, Aristide Grange 2006/06/06 # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/496783 """ Finding (and Replacing) Nemo Instant Regular Expressions Created by Aristide Grange """ import nltk.compat import tkinter as tk import re import itertools windowTitle = "Finding (and Replacing) Nemo" initialFind = r"n(.*?)e(.*?)m(.*?)o" initialRepl = r"M\1A\2K\3I" initialText = """\ Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 
""" images = { "FIND":"R0lGODlhMAAiAPcAMf/////37//35//n1v97Off///f/9/f37/fexvfOvfeEQvd7QvdrQvdrKfdaKfdSMfdSIe/v9+/v7+/v5+/n3u/e1u/Wxu/Gre+1lO+tnO+thO+Ua+97Y+97Oe97Me9rOe9rMe9jOe9jMe9jIe9aMefe5+fe3ufezuece+eEWudzQudaIedSIedKMedKIedCKedCId7e1t7Wzt7Oxt7Gvd69vd69rd61pd6ljN6UjN6Ue96EY95zY95rUt5rQt5jMd5SId5KIdbn59be3tbGztbGvda1rdaEa9Z7a9Z7WtZzQtZzOdZzMdZjMdZaQtZSOdZSMdZKMdZCKdZCGNY5Ic7W1s7Oxs7Gtc69xs69tc69rc6tpc6llM6clM6cjM6Ue86EY85zWs5rSs5SKc5KKc5KGMa1tcatrcalvcalnMaUpcZ7c8ZzMcZrUsZrOcZrMcZaQsZSOcZSMcZKMcZCKcZCGMYxIcYxGL3Gxr21tb21rb2lpb2crb2cjL2UnL2UlL2UhL2Ec717Wr17Ur1zWr1rMb1jUr1KMb1KIb1CIb0xGLWlrbWlpbWcnLWEe7V7c7VzY7VzUrVSKbVKMbVCMbVCIbU5KbUxIbUxEK2lta2lpa2clK2UjK2MnK2MlK2Ea617e61za61rY61rMa1jSq1aUq1aSq1SQq1KKa0xEKWlnKWcnKWUnKWUhKWMjKWEa6Vza6VrWqVjMaVaUqVaKaVSMaVCMaU5KaUxIaUxGJyclJyMe5yElJyEhJx7e5x7c5xrOZxaQpxSOZxKQpw5IZSMhJSEjJR7c5Rre5RrY5RrUpRSQpRSKZRCOZRCKZQxKZQxIYyEhIx7hIxza4xzY4xrc4xjUoxaa4xaUoxSSoxKQoxCMYw5GIR7c4Rzc4Rre4RjY4RjWoRaa4RSWoRSUoRSMYRKQoRCOYQ5KYQxIXtra3taY3taSntKOXtCMXtCKXNCMXM5MXMxIWtSUmtKSmtKQmtCOWs5MWs5KWs5IWNCKWMxIVIxKUIQCDkhGAAAACH+AS4ALAAAAAAwACIAAAj/AAEIHEiwoMGDCBMqXMiwoUOHMqxIeEiRoZVp7cpZ29WrF4WKIAd208dGAQEVbiTVChUjZMU9+pYQmPmBZpxgvVw+nDdKwQICNVcIXQEkTgKdDdUJ+/nggVAXK1xI3TEA6UIr2uJ8iBqka1cXXTlkqGoVYRZ7iLyqBSs0iiEtZQVKiDGxBI1u3NR6lUpGDKg8MSgEQCphU7Z22vhg0dILXRCpYLuSCcYJT4wqXASBQaBzU7klHxC127OHD7ZDJFpERqRt0x5OnwQpmZmCLEhrbgg4WIHO1RY+nbQ9WRGEDJlmnXwJ+9FBgXMCIzYMVijBBgYMFxIMqJBMSc0Ht7qh/+Gjpte2rnYsYeNlasWIBgQ6yCewIoPCCp/cyP/wgUGbXVu0QcADZNBDnh98gHMLGXYQUw02w61QU3wdbNWDbQVVIIhMMwFF1DaZiPLBAy7E04kafrjSizaK3LFNNc0AAYRQDsAHHQlJ2IDQJ2zE1+EKDjiAijShkECCC8Qgw4cr7ZgyzC2WaHPNLWWoNeNWPiRAw0QFWQFMhz8C+QQ20yAiVSrY+MGOJCsccsst2GCzoHFxxEGGC+8hgs0MB2kyCpgzrUDCbs1Es41UdtATHFFkWELMOtsoQsYcgvRRQw5RSDgGOjZMR1AvPQIq6KCo9AKOJWDd48owQlHR4DXEKP9iyRrK+DNNBTu4RwIPFeTAGUG7hAomkA84gEg1m6ADljy9PBKGGJY4ig0xlsTBRSn98FOFDUC8pwQOPkgHbCGAzhTkA850s0c7j6Hjix9+gBIrMXLeAccWXUCyiRBcBEECdEJ98KtAqtBCYQc/OvDENnl4gYpUxISCIjjzylkGGV9okYUVNogRhAOBuuAEhjG08wOgDYzAgA5bCjIoCe5uwUk80RKTTSppPREGGGCIISOQ9AXBg6cC6WIywvCpoMHAocRBwhP4bHLFLujYkV42xNxBRhAyGrc113EgYtRBerDDDHMoDCyQEL5sE083EkgwQyBhxGFHMM206DUixGxmE0wssbQjCQ4JCaFKFwgQTVAVVhQUwAVPIFJKrHfYYRwi6OCDzzuIJIFhXAD0EccPsYRiSyqKSDpFcWSMIcZRoBMkQyA2BGZDIKSYcggih8TRRg4VxM5QABVYYLxgwiev/PLMCxQQADs=", "find":"R0lGODlhMAAiAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OSkpKRgYGAAAAAAAAAAAAAAAAAAAACH+AS4ALAAAAAAwACIAAAX/ICCOZGmeaKquY2AGLiuvMCAUBuHWc48Kh0iFInEYCb4kSQCxPBiMxkMigRQEgJiSFVBYHNGG0RiZOHjblWAiiY4fkDhEYoBp06dAWfyAQyKAgAwDaHgnB0RwgYASgQ0IhDuGJDAIFhMRVFSLEX8QCJJ4AQM5AgQHTZqqjBAOCQQEkWkCDRMUFQsICQ4Vm5maEwwHOAsPDTpKMAsUDlO4CssTcb+2DAp8YGCyNFoCEsZwFQ3QDRTTVBRS0g1QbgsCd5QAAwgIBwYFAwStzQ8UEdCKVchky0yVBw7YuXkAKt4IAg74vXHVagqFBRgXSCAyYWAVCH0SNhDTitCJfSL5/4RbAPKPhQYYjVCYYAvCP0BxEDaD8CheAAHNwqh8MMGPSwgLeJWhwHSjqkYI+xg4MMCEgQjtRvZ7UAYCpghMF7CxONOWJkYR+rCpY4JlVpVxKDwYWEactKW9mhYRtqCTgwgWEMArERSK1j5q//6T8KXonFsShpiJkAECgQYVjykooCVA0JGHEWNiYCHThTFeb3UkoiCCBgwGEKQ1kuAJlhFwhA71h5SukwUM5qqeCSGBgicEWkfNiWSERtBad4JNIBaQBaQah1ToyGZBAnsIuIJs1qnqiAIVjIE2gnAB1T5x0icgzXT79ipgMOOEH6HBbREBMJCeGEY08IoLAkzB1YYFwjxwSUGSNULQJnNUwRYlCcyEkALIxECAP9cNMMABYpRhy3ZsSLDaR70oUAiABGCkAxowCGCAAfDYIQACXoElGRsdXWDBdg2Y90IWktDYGYAB9PWHP0PMdFZaF07SQgAFNDAMAQg0QA1UC8xoZQl22JGFPgWkOUCOL1pZQyhjxinnnCWEAAA7", 
"REPL":"R0lGODlhMAAjAPcAMf/////3//+lOf+UKf+MEPf///f39/f35/fv7/ecQvecOfecKfeUIfeUGPeUEPeUCPeMAO/37+/v9+/v3u/n3u/n1u+9jO+9c++1hO+ta++tY++tWu+tUu+tSu+lUu+lQu+lMe+UMe+UKe+UGO+UEO+UAO+MCOfv5+fvxufn7+fn5+fnzue9lOe9c+e1jOe1e+e1c+e1a+etWuetUuelQuecOeeUUueUCN7e597e3t7e1t7ezt7evd7Wzt7Oxt7Ovd7Otd7Opd7OnN7Gtd7Gpd69lN61hN6ta96lStbextberdbW3tbWztbWxtbOvdbOrda1hNalUtaECM7W1s7Ozs7Oxs7Otc7Gxs7Gvc69tc69rc69pc61jM6lc8bWlMbOvcbGxsbGpca9tca9pca1nMaMAL3OhL3Gtb21vb21tb2tpb2tnL2tlLW9tbW9pbW9e7W1pbWtjLWcKa21nK2tra2tnK2tlK2lpa2llK2ljK2le6WlnKWljKWUe6WUc6WUY5y1QpyclJycjJychJyUc5yMY5StY5SUe5SMhJSMe5SMc5SMWpSEa5SESoyUe4yMhIyEY4SlKYScWoSMe4SEe4SEa4R7c4R7Y3uMY3uEe3t7e3t7c3tza3tzY3trKXtjIXOcAHOUMXOEY3Nzc3NzWnNrSmulCGuUMWuMGGtzWmtrY2taMWtaGGOUOWOMAGNzUmNjWmNjSmNaUmNaQmNaOWNaIWNSCFqcAFpjUlpSMVpSIVpSEFpKKVKMAFJSUlJSSlJSMVJKMVJKGFJKAFI5CEqUAEqEAEpzQkpKIUpCQkpCGEpCAEo5EEoxAEJjOUJCOUJCAEI5IUIxADl7ADlaITlCOTkxMTkxKTkxEDkhADFzADFrGDE5OTExADEpEClrCCkxKSkpKSkpISkpACkhCCkhACkYACFzACFrACEhCCEYGBhjEBhjABghABgYCBgYABgQEBgQABAQABAIAAhjAAhSAAhKAAgIEAgICABaAABCAAAhAAAQAAAIAAAAAAAAACH+AS4ALAAAAAAwACMAAAj/AAEIHEiwoMGDCBMqXMiwocOHAA4cgEixIIIJO3JMmAjADIqKFU/8MHIkg5EgYXx4iaTkI0iHE6wE2TCggYILQayEAgXIy8uGCKz8sDCAQAMRG3iEcXULlJkJPwli3OFjh9UdYYLE6NBhA04UXHoVA2XoTZgfPKBWlOBDphAWOdfMcfMDLloeO3hIMjbWVCQ5Fn6E2UFxgpsgFjYIEBADrZU6luqEEfqjTqpt54z1uuWqTIcgWAk7PECGzIUQDRosDmxlUrVJkwQJkqVuX71v06YZcyUlROAdbnLAJKPFyAYFAhoMwFlnEh0rWkpz8raPHm7dqKKc/KFFkBUrVn1M/ziBcEIeLUEQI8/AYk0i9Be4sqjsrN66c9/OnbobhpR3HkIUoZ0WVnBE0AGLFKKFD0HAFUQe77HQgQI1hRBDEHMcY0899bBzihZuCPILJD8EccEGGzwAQhFaUHHQH82sUkgeNHISDBk8WCCCcsqFUEQWmOyzjz3sUGNNOO5Y48YOEgowAAQhnBScQV00k82V47jzjy9CXZBcjziFoco//4CDiSOyhPMPLkJZkEBqJmRQxA9uZGEQD8Ncmc044/zzDF2IZQBCCDYE8QMZz/iiCSx0neHGI7BIhhhNn+1gxRpokEcQAp7seWU7/PwTyxqG/iCEEVzQmUombnDRxRExzP9nBR2PCKLFD3UJwcMPa/SRqUGNWJmNOVn+M44ukMRB4KGcWDNLVhuUMEIJAlzwA3DJBHMJIXm4sQYhqyxCRQQGLSIsn1qac2UzysQSyzX/hLMGD0F0IMCODYAQBA9W/PKPOcRiw0wzwxTiokF9dLMnuv/Mo+fCZF7jBr0xbDDCACWEYKgb1vzjDp/jZNOMLX0IZxAKq2TZTjtaOjwOsXyG+s8sZJTIQsUdIGHoJPf8w487QI/TDSt5mGwQFZxc406o8HiDJchk/ltLHpSlJwSvz5DpTjvmuGNOM57koelBOaAhiCaaPBLL0wwbm003peRBnBZqJMJL1ECz/HXYYx/NdAIOOVCxQyLorswymU93o0wuwfAiTDNR/xz0MLXU0XdCE+UwSTRZAq2lsSATu+4wkGvt+TjNzPLrQyegAUku2Hij5cd8LhxyM8QIg4w18HgcdC6BTBFSDmfQqsovttveDcG7lFLHI75cE841sARCxeWsnxC4G9HADPK6ywzDCRqBo0EHHWhMgT1IJzziNci1N7PMKnSYfML96/90AiJKey/0KtbLX1QK0rrNnQ541xugQ7SHhkXBghN0SKACWRc4KlAhBwKcIOYymJCAAAA7", "repl":"R0lGODlhMAAjAPQAMf////f39+/v7+fn597e3tbW1s7OzsbGxr29vbW1ta2traWlpZycnJSUlIyMjISEhHt7e3Nzc2tra2NjY1paWlJSUkpKSkJCQjk5OTExMSkpKSEhIRgYGBAQEAgICAAAACH+AS4ALAAAAAAwACMAAAX/ICCOZGmeaKqubOu+gCDANBkIQ1EMQhAghFptYEAkEgjEwXBo7ISvweGgWCwUysPjwTgEoCafTySYIhYMxgLBjEQgCULvCw0QdAZdoVhUIJUFChISEAxYeQM1N1OMTAp+UwZ5eA4TEhFbDWYFdC4ECVMJjwl5BwsQa0umEhUVlhESDgqlBp0rAn5nVpBMDxeZDRQbHBgWFBSWDgtLBnFjKwRYCI9VqQsPs0YKEcMXFq0UEalFDWx4BAO2IwPjppAKDkrTWKYUGd7fEJJFEZpM00cOzCgh4EE8SaoWxKNixQooBRMyZMBwAYIRBhUgLDGS4MoBJeoANMhAgQsaCRZm/5lqaCUJhA4cNHjDoKEDBlJUHqkBlYBTiQUZNGjYMMxDhY3VWk6R4MEDBoMUak5AqoYBqANIBo4wcGGDUKIeLlzVZmWJggsVIkwAZaQSA3kdZzlKkIiEAAlDvW5oOkEBs488JTw44oeUIwdvVTFTUK7uiAAPgubt8GFDhQepqETAQCFU1UMGzlqAgFhUsAcCS0AO6lUDhw8xNRSbENGDhgWSHjWUe6ACbKITizmopZoBa6KvOwj9uuHDhwxyj3xekgDDhw5EvWKo0IB4iQLCOCC/njc7ZQ8UeGvza+ABZZgcxJNc4FO1gc0cOsCUrHevc8tdIMTIAhc4F198G2Qwwd8CBIQUAwEINABBBJUwR9R5wElgVRLwWODBBx4cGB8GEzDQIAo33CGJA8gh+JoH/clUgQU0YvDhdfmJdwEFC6Sjgg8yEPAABsPkh2F22cl2AQbn6QdTghTQ5eAJAQyQAAQV0MSBB9gRVZ4GE1mw5JZOAmiAVi1UWcAZDrDyZXYTeaOhA/bIVuIBPtKQ4h7ViYekUPdcEAEbzTzCRp5CADmAAwj+ORGPBcgwAAHo9ABGCYtm0ChwFHShlRiXhmHlkAcCiOeUodqQw5W0oXLAiamy4MOkjOyAaqxUymApDCEAADs=", } colors = 
["#FF7B39","#80F121"] emphColors = ["#DAFC33","#F42548"] fieldParams = { "height":3, "width":70, "font":("monaco",14), "highlightthickness":0, "borderwidth":0, "background":"white", } textParams = { "bg":"#F7E0D4", "fg":"#2321F1", "highlightthickness":0, "width":1, "height":10, "font":("verdana",16), "wrap":"word", } class Zone: def __init__(self, image, initialField, initialText): frm = tk.Frame(root) frm.config(background="white") self.image = tk.PhotoImage(format='gif',data=images[image.upper()]) self.imageDimmed = tk.PhotoImage(format='gif',data=images[image]) self.img = tk.Label(frm) self.img.config(borderwidth=0) self.img.pack(side = "left") self.fld = tk.Text(frm, **fieldParams) self.initScrollText(frm,self.fld,initialField) frm = tk.Frame(root) self.txt = tk.Text(frm, **textParams) self.initScrollText(frm,self.txt,initialText) for i in range(2): self.txt.tag_config(colors[i], background = colors[i]) self.txt.tag_config("emph"+colors[i], foreground = emphColors[i]) def initScrollText(self,frm,txt,contents): scl = tk.Scrollbar(frm) scl.config(command = txt.yview) scl.pack(side="right",fill="y") txt.pack(side = "left", expand=True, fill="x") txt.config(yscrollcommand = scl.set) txt.insert("1.0",contents) frm.pack(fill = "x") tk.Frame(height=2, bd=1, relief="ridge").pack(fill="x") def refresh(self): self.colorCycle = itertools.cycle(colors) try: self.substitute() self.img.config(image = self.image) except re.error: self.img.config(image = self.imageDimmed) class FindZone(Zone): def addTags(self,m): color = next(self.colorCycle) self.txt.tag_add(color,"1.0+%sc"%m.start(),"1.0+%sc"%m.end()) try: self.txt.tag_add("emph"+color,"1.0+%sc"%m.start("emph"), "1.0+%sc"%m.end("emph")) except: pass def substitute(self,*args): for color in colors: self.txt.tag_remove(color,"1.0","end") self.txt.tag_remove("emph"+color,"1.0","end") self.rex = re.compile("") # default value in case of misformed regexp self.rex = re.compile(self.fld.get("1.0","end")[:-1],re.MULTILINE) try: re.compile("(?P%s)" % self.fld.get(tk.SEL_FIRST, tk.SEL_LAST)) self.rexSel = re.compile("%s(?P%s)%s" % ( self.fld.get("1.0",tk.SEL_FIRST), self.fld.get(tk.SEL_FIRST,tk.SEL_LAST), self.fld.get(tk.SEL_LAST,"end")[:-1], ),re.MULTILINE) except: self.rexSel = self.rex self.rexSel.sub(self.addTags,self.txt.get("1.0","end")) class ReplaceZone(Zone): def addTags(self,m): s = sz.rex.sub(self.repl,m.group()) self.txt.delete("1.0+%sc"%(m.start()+self.diff), "1.0+%sc"%(m.end()+self.diff)) self.txt.insert("1.0+%sc"%(m.start()+self.diff),s, next(self.colorCycle)) self.diff += len(s) - (m.end() - m.start()) def substitute(self): self.txt.delete("1.0","end") self.txt.insert("1.0",sz.txt.get("1.0","end")[:-1]) self.diff = 0 self.repl = rex0.sub(r"\\g<\1>",self.fld.get("1.0","end")[:-1]) sz.rex.sub(self.addTags,sz.txt.get("1.0","end")[:-1]) def launchRefresh(_): sz.fld.after_idle(sz.refresh) rz.fld.after_idle(rz.refresh) def app(): global root, sz, rz, rex0 root = tk.Tk() root.resizable(height=False,width=True) root.title(windowTitle) root.minsize(width=250,height=0) sz = FindZone("find",initialFind,initialText) sz.fld.bind("",launchRefresh) sz.fld.bind("",launchRefresh) sz.fld.bind("",launchRefresh) sz.rexSel = re.compile("") rz = ReplaceZone("repl",initialRepl,"") rex0 = re.compile(r"(?",launchRefresh) launchRefresh(None) root.mainloop() if __name__ == '__main__': app() __all__ = ['app'] nltk-3.1/nltk/app/rdparser_app.py0000644000076500000240000010646412607224144016655 0ustar sbstaff00000000000000# Natural Language Toolkit: Recursive Descent Parser 
Application # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ A graphical tool for exploring the recursive descent parser. The recursive descent parser maintains a tree, which records the structure of the portion of the text that has been parsed. It uses CFG productions to expand the fringe of the tree, and matches its leaves against the text. Initially, the tree contains the start symbol ("S"). It is shown in the main canvas, to the right of the list of available expansions. The parser builds up a tree structure for the text using three operations: - "expand" uses a CFG production to add children to a node on the fringe of the tree. - "match" compares a leaf in the tree to a text token. - "backtrack" returns the tree to its state before the most recent expand or match operation. The parser maintains a list of tree locations called a "frontier" to remember which nodes have not yet been expanded and which leaves have not yet been matched against the text. The leftmost frontier node is shown in green, and the other frontier nodes are shown in blue. The parser always performs expand and match operations on the leftmost element of the frontier. You can control the parser's operation by using the "expand," "match," and "backtrack" buttons; or you can use the "step" button to let the parser automatically decide which operation to apply. The parser uses the following rules to decide which operation to apply: - If the leftmost frontier element is a token, try matching it. - If the leftmost frontier element is a node, try expanding it with the first untried expansion. - Otherwise, backtrack. The "expand" button applies the untried expansion whose CFG production is listed earliest in the grammar. To manually choose which expansion to apply, click on a CFG production from the list of available expansions, on the left side of the main window. The "autostep" button will let the parser continue applying applications to the tree until it reaches a complete parse. You can cancel an autostep in progress at any time by clicking on the "autostep" button again. Keyboard Shortcuts:: [Space]\t Perform the next expand, match, or backtrack operation [a]\t Step through operations until the next complete parse [e]\t Perform an expand operation [m]\t Perform a match operation [b]\t Perform a backtrack operation [Delete]\t Reset the parser [g]\t Show/hide available expansions list [h]\t Help [Ctrl-p]\t Print [q]\t Quit """ import nltk.compat import tkinter.font from tkinter import (Listbox, IntVar, Button, Frame, Label, Menu, Scrollbar, Tk) from nltk.tree import Tree from nltk.util import in_idle from nltk.parse import SteppingRecursiveDescentParser from nltk.draw.util import TextWidget, ShowText, CanvasFrame, EntryDialog from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment class RecursiveDescentApp(object): """ A graphical tool for exploring the recursive descent parser. The tool displays the parser's tree and the remaining text, and allows the user to control the parser's operation. In particular, the user can expand subtrees on the frontier, match tokens on the frontier against the text, and backtrack. A "step" button simply steps through the parsing process, performing the operations that ``RecursiveDescentParser`` would use. """ def __init__(self, grammar, sent, trace=0): self._sent = sent self._parser = SteppingRecursiveDescentParser(grammar, trace) # Set up the main window. 
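# ------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the original module):
# the expand / match / backtrack cycle described in the module docstring
# can be driven without any GUI, using the same SteppingRecursiveDescentParser
# this class wraps and only the methods the class itself calls (initialize,
# expand, untried_match, match, backtrack, currently_complete, tree).  The
# toy grammar and sentence are invented for the example.
from nltk import CFG
from nltk.parse import SteppingRecursiveDescentParser

toy_grammar = CFG.fromstring("""
    S -> NP VP
    NP -> Det N
    VP -> V NP
    Det -> 'the'
    N -> 'dog' | 'cat'
    V -> 'saw'
""")
stepper = SteppingRecursiveDescentParser(toy_grammar)
stepper.initialize('the dog saw the cat'.split())
while not stepper.currently_complete():
    # Same priority order as RecursiveDescentApp._step(): expand first,
    # then match, and backtrack only when neither applies.
    if stepper.expand() is not None:
        continue
    if stepper.untried_match() and stepper.match() is not None:
        continue
    if not stepper.backtrack():
        break            # search space exhausted, no parse found
print(stepper.tree())
# ------------------------------------------------------------------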
self._top = Tk() self._top.title('Recursive Descent Parser Application') # Set up key bindings. self._init_bindings() # Initialize the fonts. self._init_fonts(self._top) # Animations. animating_lock is a lock to prevent the demo # from performing new operations while it's animating. self._animation_frames = IntVar(self._top) self._animation_frames.set(5) self._animating_lock = 0 self._autostep = 0 # The user can hide the grammar. self._show_grammar = IntVar(self._top) self._show_grammar.set(1) # Create the basic frames. self._init_menubar(self._top) self._init_buttons(self._top) self._init_feedback(self._top) self._init_grammar(self._top) self._init_canvas(self._top) # Initialize the parser. self._parser.initialize(self._sent) # Resize callback self._canvas.bind('', self._configure) ######################################### ## Initialization Helpers ######################################### def _init_fonts(self, root): # See: self._sysfont = tkinter.font.Font(font=Button()["font"]) root.option_add("*Font", self._sysfont) # TWhat's our font size (default=same as sysfont) self._size = IntVar(root) self._size.set(self._sysfont.cget('size')) self._boldfont = tkinter.font.Font(family='helvetica', weight='bold', size=self._size.get()) self._font = tkinter.font.Font(family='helvetica', size=self._size.get()) if self._size.get() < 0: big = self._size.get()-2 else: big = self._size.get()+2 self._bigfont = tkinter.font.Font(family='helvetica', weight='bold', size=big) def _init_grammar(self, parent): # Grammar view. self._prodframe = listframe = Frame(parent) self._prodframe.pack(fill='both', side='left', padx=2) self._prodlist_label = Label(self._prodframe, font=self._boldfont, text='Available Expansions') self._prodlist_label.pack() self._prodlist = Listbox(self._prodframe, selectmode='single', relief='groove', background='white', foreground='#909090', font=self._font, selectforeground='#004040', selectbackground='#c0f0c0') self._prodlist.pack(side='right', fill='both', expand=1) self._productions = list(self._parser.grammar().productions()) for production in self._productions: self._prodlist.insert('end', (' %s' % production)) self._prodlist.config(height=min(len(self._productions), 25)) # Add a scrollbar if there are more than 25 productions. if len(self._productions) > 25: listscroll = Scrollbar(self._prodframe, orient='vertical') self._prodlist.config(yscrollcommand = listscroll.set) listscroll.config(command=self._prodlist.yview) listscroll.pack(side='left', fill='y') # If they select a production, apply it. self._prodlist.bind('<>', self._prodlist_select) def _init_bindings(self): # Key bindings are a good thing. 
self._top.bind('', self.destroy) self._top.bind('', self.destroy) self._top.bind('', self.destroy) self._top.bind('e', self.expand) #self._top.bind('', self.expand) #self._top.bind('', self.expand) self._top.bind('m', self.match) self._top.bind('', self.match) self._top.bind('', self.match) self._top.bind('b', self.backtrack) self._top.bind('', self.backtrack) self._top.bind('', self.backtrack) self._top.bind('', self.backtrack) self._top.bind('', self.backtrack) self._top.bind('a', self.autostep) #self._top.bind('', self.autostep) self._top.bind('', self.autostep) self._top.bind('', self.cancel_autostep) self._top.bind('', self.step) self._top.bind('', self.reset) self._top.bind('', self.postscript) #self._top.bind('', self.help) #self._top.bind('', self.help) self._top.bind('', self.help) self._top.bind('', self.help) #self._top.bind('', self.toggle_grammar) #self._top.bind('', self.toggle_grammar) #self._top.bind('', self.toggle_grammar) self._top.bind('', self.edit_grammar) self._top.bind('', self.edit_sentence) def _init_buttons(self, parent): # Set up the frames. self._buttonframe = buttonframe = Frame(parent) buttonframe.pack(fill='none', side='bottom', padx=3, pady=2) Button(buttonframe, text='Step', background='#90c0d0', foreground='black', command=self.step,).pack(side='left') Button(buttonframe, text='Autostep', background='#90c0d0', foreground='black', command=self.autostep,).pack(side='left') Button(buttonframe, text='Expand', underline=0, background='#90f090', foreground='black', command=self.expand).pack(side='left') Button(buttonframe, text='Match', underline=0, background='#90f090', foreground='black', command=self.match).pack(side='left') Button(buttonframe, text='Backtrack', underline=0, background='#f0a0a0', foreground='black', command=self.backtrack).pack(side='left') # Replace autostep... 
# self._autostep_button = Button(buttonframe, text='Autostep', # underline=0, command=self.autostep) # self._autostep_button.pack(side='left') def _configure(self, event): self._autostep = 0 (x1, y1, x2, y2) = self._cframe.scrollregion() y2 = event.height - 6 self._canvas['scrollregion'] = '%d %d %d %d' % (x1,y1,x2,y2) self._redraw() def _init_feedback(self, parent): self._feedbackframe = feedbackframe = Frame(parent) feedbackframe.pack(fill='x', side='bottom', padx=3, pady=3) self._lastoper_label = Label(feedbackframe, text='Last Operation:', font=self._font) self._lastoper_label.pack(side='left') lastoperframe = Frame(feedbackframe, relief='sunken', border=1) lastoperframe.pack(fill='x', side='right', expand=1, padx=5) self._lastoper1 = Label(lastoperframe, foreground='#007070', background='#f0f0f0', font=self._font) self._lastoper2 = Label(lastoperframe, anchor='w', width=30, foreground='#004040', background='#f0f0f0', font=self._font) self._lastoper1.pack(side='left') self._lastoper2.pack(side='left', fill='x', expand=1) def _init_canvas(self, parent): self._cframe = CanvasFrame(parent, background='white', #width=525, height=250, closeenough=10, border=2, relief='sunken') self._cframe.pack(expand=1, fill='both', side='top', pady=2) canvas = self._canvas = self._cframe.canvas() # Initially, there's no tree or text self._tree = None self._textwidgets = [] self._textline = None def _init_menubar(self, parent): menubar = Menu(parent) filemenu = Menu(menubar, tearoff=0) filemenu.add_command(label='Reset Parser', underline=0, command=self.reset, accelerator='Del') filemenu.add_command(label='Print to Postscript', underline=0, command=self.postscript, accelerator='Ctrl-p') filemenu.add_command(label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x') menubar.add_cascade(label='File', underline=0, menu=filemenu) editmenu = Menu(menubar, tearoff=0) editmenu.add_command(label='Edit Grammar', underline=5, command=self.edit_grammar, accelerator='Ctrl-g') editmenu.add_command(label='Edit Text', underline=5, command=self.edit_sentence, accelerator='Ctrl-t') menubar.add_cascade(label='Edit', underline=0, menu=editmenu) rulemenu = Menu(menubar, tearoff=0) rulemenu.add_command(label='Step', underline=1, command=self.step, accelerator='Space') rulemenu.add_separator() rulemenu.add_command(label='Match', underline=0, command=self.match, accelerator='Ctrl-m') rulemenu.add_command(label='Expand', underline=0, command=self.expand, accelerator='Ctrl-e') rulemenu.add_separator() rulemenu.add_command(label='Backtrack', underline=0, command=self.backtrack, accelerator='Ctrl-b') menubar.add_cascade(label='Apply', underline=0, menu=rulemenu) viewmenu = Menu(menubar, tearoff=0) viewmenu.add_checkbutton(label="Show Grammar", underline=0, variable=self._show_grammar, command=self._toggle_grammar) viewmenu.add_separator() viewmenu.add_radiobutton(label='Tiny', variable=self._size, underline=0, value=10, command=self.resize) viewmenu.add_radiobutton(label='Small', variable=self._size, underline=0, value=12, command=self.resize) viewmenu.add_radiobutton(label='Medium', variable=self._size, underline=0, value=14, command=self.resize) viewmenu.add_radiobutton(label='Large', variable=self._size, underline=0, value=18, command=self.resize) viewmenu.add_radiobutton(label='Huge', variable=self._size, underline=0, value=24, command=self.resize) menubar.add_cascade(label='View', underline=0, menu=viewmenu) animatemenu = Menu(menubar, tearoff=0) animatemenu.add_radiobutton(label="No Animation", underline=0, 
variable=self._animation_frames, value=0) animatemenu.add_radiobutton(label="Slow Animation", underline=0, variable=self._animation_frames, value=10, accelerator='-') animatemenu.add_radiobutton(label="Normal Animation", underline=0, variable=self._animation_frames, value=5, accelerator='=') animatemenu.add_radiobutton(label="Fast Animation", underline=0, variable=self._animation_frames, value=2, accelerator='+') menubar.add_cascade(label="Animate", underline=1, menu=animatemenu) helpmenu = Menu(menubar, tearoff=0) helpmenu.add_command(label='About', underline=0, command=self.about) helpmenu.add_command(label='Instructions', underline=0, command=self.help, accelerator='F1') menubar.add_cascade(label='Help', underline=0, menu=helpmenu) parent.config(menu=menubar) ######################################### ## Helper ######################################### def _get(self, widget, treeloc): for i in treeloc: widget = widget.subtrees()[i] if isinstance(widget, TreeSegmentWidget): widget = widget.label() return widget ######################################### ## Main draw procedure ######################################### def _redraw(self): canvas = self._canvas # Delete the old tree, widgets, etc. if self._tree is not None: self._cframe.destroy_widget(self._tree) for twidget in self._textwidgets: self._cframe.destroy_widget(twidget) if self._textline is not None: self._canvas.delete(self._textline) # Draw the tree. helv = ('helvetica', -self._size.get()) bold = ('helvetica', -self._size.get(), 'bold') attribs = {'tree_color': '#000000', 'tree_width': 2, 'node_font': bold, 'leaf_font': helv,} tree = self._parser.tree() self._tree = tree_to_treesegment(canvas, tree, **attribs) self._cframe.add_widget(self._tree, 30, 5) # Draw the text. helv = ('helvetica', -self._size.get()) bottom = y = self._cframe.scrollregion()[3] self._textwidgets = [TextWidget(canvas, word, font=self._font) for word in self._sent] for twidget in self._textwidgets: self._cframe.add_widget(twidget, 0, 0) twidget.move(0, bottom-twidget.bbox()[3]-5) y = min(y, twidget.bbox()[1]) # Draw a line over the text, to separate it from the tree. self._textline = canvas.create_line(-5000, y-5, 5000, y-5, dash='.') # Highlight appropriate nodes. self._highlight_nodes() self._highlight_prodlist() # Make sure the text lines up. self._position_text() def _redraw_quick(self): # This should be more-or-less sufficient after an animation. self._highlight_nodes() self._highlight_prodlist() self._position_text() def _highlight_nodes(self): # Highlight the list of nodes to be checked. bold = ('helvetica', -self._size.get(), 'bold') for treeloc in self._parser.frontier()[:1]: self._get(self._tree, treeloc)['color'] = '#20a050' self._get(self._tree, treeloc)['font'] = bold for treeloc in self._parser.frontier()[1:]: self._get(self._tree, treeloc)['color'] = '#008080' def _highlight_prodlist(self): # Highlight the productions that can be expanded. # Boy, too bad tkinter doesn't implement Listbox.itemconfig; # that would be pretty useful here. 
self._prodlist.delete(0, 'end') expandable = self._parser.expandable_productions() untried = self._parser.untried_expandable_productions() productions = self._productions for index in range(len(productions)): if productions[index] in expandable: if productions[index] in untried: self._prodlist.insert(index, ' %s' % productions[index]) else: self._prodlist.insert(index, ' %s (TRIED)' % productions[index]) self._prodlist.selection_set(index) else: self._prodlist.insert(index, ' %s' % productions[index]) def _position_text(self): # Line up the text widgets that are matched against the tree numwords = len(self._sent) num_matched = numwords - len(self._parser.remaining_text()) leaves = self._tree_leaves()[:num_matched] xmax = self._tree.bbox()[0] for i in range(0, len(leaves)): widget = self._textwidgets[i] leaf = leaves[i] widget['color'] = '#006040' leaf['color'] = '#006040' widget.move(leaf.bbox()[0] - widget.bbox()[0], 0) xmax = widget.bbox()[2] + 10 # Line up the text widgets that are not matched against the tree. for i in range(len(leaves), numwords): widget = self._textwidgets[i] widget['color'] = '#a0a0a0' widget.move(xmax - widget.bbox()[0], 0) xmax = widget.bbox()[2] + 10 # If we have a complete parse, make everything green :) if self._parser.currently_complete(): for twidget in self._textwidgets: twidget['color'] = '#00a000' # Move the matched leaves down to the text. for i in range(0, len(leaves)): widget = self._textwidgets[i] leaf = leaves[i] dy = widget.bbox()[1] - leaf.bbox()[3] - 10.0 dy = max(dy, leaf.parent().label().bbox()[3] - leaf.bbox()[3] + 10) leaf.move(0, dy) def _tree_leaves(self, tree=None): if tree is None: tree = self._tree if isinstance(tree, TreeSegmentWidget): leaves = [] for child in tree.subtrees(): leaves += self._tree_leaves(child) return leaves else: return [tree] ######################################### ## Button Callbacks ######################################### def destroy(self, *e): self._autostep = 0 if self._top is None: return self._top.destroy() self._top = None def reset(self, *e): self._autostep = 0 self._parser.initialize(self._sent) self._lastoper1['text'] = 'Reset Application' self._lastoper2['text'] = '' self._redraw() def autostep(self, *e): if self._animation_frames.get() == 0: self._animation_frames.set(2) if self._autostep: self._autostep = 0 else: self._autostep = 1 self._step() def cancel_autostep(self, *e): #self._autostep_button['text'] = 'Autostep' self._autostep = 0 # Make sure to stop auto-stepping if we get any user input. def step(self, *e): self._autostep = 0; self._step() def match(self, *e): self._autostep = 0; self._match() def expand(self, *e): self._autostep = 0; self._expand() def backtrack(self, *e): self._autostep = 0; self._backtrack() def _step(self): if self._animating_lock: return # Try expanding, matching, and backtracking (in that order) if self._expand(): pass elif self._parser.untried_match() and self._match(): pass elif self._backtrack(): pass else: self._lastoper1['text'] = 'Finished' self._lastoper2['text'] = '' self._autostep = 0 # Check if we just completed a parse. 
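# ------------------------------------------------------------------
# Illustrative sketch (editor's note, not part of the original module):
# the "frontier" handled above is a list of tree locations, i.e. tuples
# of child indices, and both _get() and _backtrack() resolve such a
# location by indexing into the tree one level per element.  Here is that
# resolution step on its own, with an invented example tree; the helper
# name _sketch_node_at is hypothetical.
from nltk.tree import Tree

def _sketch_node_at(tree, treeloc):
    """Follow a tree location (tuple of child indices) down an nltk.Tree."""
    node = tree
    for i in treeloc:
        node = node[i]
    return node

toy_tree = Tree.fromstring('(S (NP (Det the) (N dog)) (VP (V barks)))')
# (1, 0) means: second child of S, then its first child -> the V subtree.
assert str(_sketch_node_at(toy_tree, (1, 0))) == '(V barks)'
# ------------------------------------------------------------------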
if self._parser.currently_complete(): self._autostep = 0 self._lastoper2['text'] += ' [COMPLETE PARSE]' def _expand(self, *e): if self._animating_lock: return old_frontier = self._parser.frontier() rv = self._parser.expand() if rv is not None: self._lastoper1['text'] = 'Expand:' self._lastoper2['text'] = rv self._prodlist.selection_clear(0, 'end') index = self._productions.index(rv) self._prodlist.selection_set(index) self._animate_expand(old_frontier[0]) return True else: self._lastoper1['text'] = 'Expand:' self._lastoper2['text'] = '(all expansions tried)' return False def _match(self, *e): if self._animating_lock: return old_frontier = self._parser.frontier() rv = self._parser.match() if rv is not None: self._lastoper1['text'] = 'Match:' self._lastoper2['text'] = rv self._animate_match(old_frontier[0]) return True else: self._lastoper1['text'] = 'Match:' self._lastoper2['text'] = '(failed)' return False def _backtrack(self, *e): if self._animating_lock: return if self._parser.backtrack(): elt = self._parser.tree() for i in self._parser.frontier()[0]: elt = elt[i] self._lastoper1['text'] = 'Backtrack' self._lastoper2['text'] = '' if isinstance(elt, Tree): self._animate_backtrack(self._parser.frontier()[0]) else: self._animate_match_backtrack(self._parser.frontier()[0]) return True else: self._autostep = 0 self._lastoper1['text'] = 'Finished' self._lastoper2['text'] = '' return False def about(self, *e): ABOUT = ("NLTK Recursive Descent Parser Application\n"+ "Written by Edward Loper") TITLE = 'About: Recursive Descent Parser Application' try: from tkinter.messagebox import Message Message(message=ABOUT, title=TITLE).show() except: ShowText(self._top, TITLE, ABOUT) def help(self, *e): self._autostep = 0 # The default font's not very legible; try using 'fixed' instead. try: ShowText(self._top, 'Help: Recursive Descent Parser Application', (__doc__ or '').strip(), width=75, font='fixed') except: ShowText(self._top, 'Help: Recursive Descent Parser Application', (__doc__ or '').strip(), width=75) def postscript(self, *e): self._autostep = 0 self._cframe.print_to_file() def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this demo is created from a non-interactive program (e.g. from a secript); otherwise, the demo will close as soon as the script completes. 
""" if in_idle(): return self._top.mainloop(*args, **kwargs) def resize(self, size=None): if size is not None: self._size.set(size) size = self._size.get() self._font.configure(size=-(abs(size))) self._boldfont.configure(size=-(abs(size))) self._sysfont.configure(size=-(abs(size))) self._bigfont.configure(size=-(abs(size+2))) self._redraw() ######################################### ## Expand Production Selection ######################################### def _toggle_grammar(self, *e): if self._show_grammar.get(): self._prodframe.pack(fill='both', side='left', padx=2, after=self._feedbackframe) self._lastoper1['text'] = 'Show Grammar' else: self._prodframe.pack_forget() self._lastoper1['text'] = 'Hide Grammar' self._lastoper2['text'] = '' # def toggle_grammar(self, *e): # self._show_grammar = not self._show_grammar # if self._show_grammar: # self._prodframe.pack(fill='both', expand='y', side='left', # after=self._feedbackframe) # self._lastoper1['text'] = 'Show Grammar' # else: # self._prodframe.pack_forget() # self._lastoper1['text'] = 'Hide Grammar' # self._lastoper2['text'] = '' def _prodlist_select(self, event): selection = self._prodlist.curselection() if len(selection) != 1: return index = int(selection[0]) old_frontier = self._parser.frontier() production = self._parser.expand(self._productions[index]) if production: self._lastoper1['text'] = 'Expand:' self._lastoper2['text'] = production self._prodlist.selection_clear(0, 'end') self._prodlist.selection_set(index) self._animate_expand(old_frontier[0]) else: # Reset the production selections. self._prodlist.selection_clear(0, 'end') for prod in self._parser.expandable_productions(): index = self._productions.index(prod) self._prodlist.selection_set(index) ######################################### ## Animation ######################################### def _animate_expand(self, treeloc): oldwidget = self._get(self._tree, treeloc) oldtree = oldwidget.parent() top = not isinstance(oldtree.parent(), TreeSegmentWidget) tree = self._parser.tree() for i in treeloc: tree = tree[i] widget = tree_to_treesegment(self._canvas, tree, node_font=self._boldfont, leaf_color='white', tree_width=2, tree_color='white', node_color='white', leaf_font=self._font) widget.label()['color'] = '#20a050' (oldx, oldy) = oldtree.label().bbox()[:2] (newx, newy) = widget.label().bbox()[:2] widget.move(oldx-newx, oldy-newy) if top: self._cframe.add_widget(widget, 0, 5) widget.move(30-widget.label().bbox()[0], 0) self._tree = widget else: oldtree.parent().replace_child(oldtree, widget) # Move the children over so they don't overlap. # Line the children up in a strange way. if widget.subtrees(): dx = (oldx + widget.label().width()/2 - widget.subtrees()[0].bbox()[0]/2 - widget.subtrees()[0].bbox()[2]/2) for subtree in widget.subtrees(): subtree.move(dx, 0) self._makeroom(widget) if top: self._cframe.destroy_widget(oldtree) else: oldtree.destroy() colors = ['gray%d' % (10*int(10*x/self._animation_frames.get())) for x in range(self._animation_frames.get(),0,-1)] # Move the text string down, if necessary. dy = widget.bbox()[3] + 30 - self._canvas.coords(self._textline)[1] if dy > 0: for twidget in self._textwidgets: twidget.move(0, dy) self._canvas.move(self._textline, 0, dy) self._animate_expand_frame(widget, colors) def _makeroom(self, treeseg): """ Make sure that no sibling tree bbox's overlap. 
""" parent = treeseg.parent() if not isinstance(parent, TreeSegmentWidget): return index = parent.subtrees().index(treeseg) # Handle siblings to the right rsiblings = parent.subtrees()[index+1:] if rsiblings: dx = treeseg.bbox()[2] - rsiblings[0].bbox()[0] + 10 for sibling in rsiblings: sibling.move(dx, 0) # Handle siblings to the left if index > 0: lsibling = parent.subtrees()[index-1] dx = max(0, lsibling.bbox()[2] - treeseg.bbox()[0] + 10) treeseg.move(dx, 0) # Keep working up the tree. self._makeroom(parent) def _animate_expand_frame(self, widget, colors): if len(colors) > 0: self._animating_lock = 1 widget['color'] = colors[0] for subtree in widget.subtrees(): if isinstance(subtree, TreeSegmentWidget): subtree.label()['color'] = colors[0] else: subtree['color'] = colors[0] self._top.after(50, self._animate_expand_frame, widget, colors[1:]) else: widget['color'] = 'black' for subtree in widget.subtrees(): if isinstance(subtree, TreeSegmentWidget): subtree.label()['color'] = 'black' else: subtree['color'] = 'black' self._redraw_quick() widget.label()['color'] = 'black' self._animating_lock = 0 if self._autostep: self._step() def _animate_backtrack(self, treeloc): # Flash red first, if we're animating. if self._animation_frames.get() == 0: colors = [] else: colors = ['#a00000', '#000000', '#a00000'] colors += ['gray%d' % (10*int(10*x/(self._animation_frames.get()))) for x in range(1, self._animation_frames.get()+1)] widgets = [self._get(self._tree, treeloc).parent()] for subtree in widgets[0].subtrees(): if isinstance(subtree, TreeSegmentWidget): widgets.append(subtree.label()) else: widgets.append(subtree) self._animate_backtrack_frame(widgets, colors) def _animate_backtrack_frame(self, widgets, colors): if len(colors) > 0: self._animating_lock = 1 for widget in widgets: widget['color'] = colors[0] self._top.after(50, self._animate_backtrack_frame, widgets, colors[1:]) else: for widget in widgets[0].subtrees(): widgets[0].remove_child(widget) widget.destroy() self._redraw_quick() self._animating_lock = 0 if self._autostep: self._step() def _animate_match_backtrack(self, treeloc): widget = self._get(self._tree, treeloc) node = widget.parent().label() dy = (1.0 * (node.bbox()[3] - widget.bbox()[1] + 14) / max(1, self._animation_frames.get())) self._animate_match_backtrack_frame(self._animation_frames.get(), widget, dy) def _animate_match(self, treeloc): widget = self._get(self._tree, treeloc) dy = ((self._textwidgets[0].bbox()[1] - widget.bbox()[3] - 10.0) / max(1, self._animation_frames.get())) self._animate_match_frame(self._animation_frames.get(), widget, dy) def _animate_match_frame(self, frame, widget, dy): if frame > 0: self._animating_lock = 1 widget.move(0, dy) self._top.after(10, self._animate_match_frame, frame-1, widget, dy) else: widget['color'] = '#006040' self._redraw_quick() self._animating_lock = 0 if self._autostep: self._step() def _animate_match_backtrack_frame(self, frame, widget, dy): if frame > 0: self._animating_lock = 1 widget.move(0, dy) self._top.after(10, self._animate_match_backtrack_frame, frame-1, widget, dy) else: widget.parent().remove_child(widget) widget.destroy() self._animating_lock = 0 if self._autostep: self._step() def edit_grammar(self, *e): CFGEditor(self._top, self._parser.grammar(), self.set_grammar) def set_grammar(self, grammar): self._parser.set_grammar(grammar) self._productions = list(grammar.productions()) self._prodlist.delete(0, 'end') for production in self._productions: self._prodlist.insert('end', (' %s' % production)) def 
edit_sentence(self, *e): sentence = " ".join(self._sent) title = 'Edit Text' instr = 'Enter a new sentence to parse.' EntryDialog(self._top, sentence, instr, self.set_sentence, title) def set_sentence(self, sentence): self._sent = sentence.split() #[XX] use tagged? self.reset() def app(): """ Create a recursive descent parser demo, using a simple grammar and text. """ from nltk.grammar import CFG grammar = CFG.fromstring(""" # Grammatical productions. S -> NP VP NP -> Det N PP | Det N VP -> V NP PP | V NP | V PP -> P NP # Lexical productions. NP -> 'I' Det -> 'the' | 'a' N -> 'man' | 'park' | 'dog' | 'telescope' V -> 'ate' | 'saw' P -> 'in' | 'under' | 'with' """) sent = 'the dog saw a man in the park'.split() RecursiveDescentApp(grammar, sent).mainloop() if __name__ == '__main__': app() __all__ = ['app'] nltk-3.1/nltk/app/srparser_app.py0000644000076500000240000010023712607224144016664 0ustar sbstaff00000000000000# Natural Language Toolkit: Shift-Reduce Parser Application # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ A graphical tool for exploring the shift-reduce parser. The shift-reduce parser maintains a stack, which records the structure of the portion of the text that has been parsed. The stack is initially empty. Its contents are shown on the left side of the main canvas. On the right side of the main canvas is the remaining text. This is the portion of the text which has not yet been considered by the parser. The parser builds up a tree structure for the text using two operations: - "shift" moves the first token from the remaining text to the top of the stack. In the demo, the top of the stack is its right-hand side. - "reduce" uses a grammar production to combine the rightmost stack elements into a single tree token. You can control the parser's operation by using the "shift" and "reduce" buttons; or you can use the "step" button to let the parser automatically decide which operation to apply. The parser uses the following rules to decide which operation to apply: - Only shift if no reductions are available. - If multiple reductions are available, then apply the reduction whose CFG production is listed earliest in the grammar. The "reduce" button applies the reduction whose CFG production is listed earliest in the grammar. There are two ways to manually choose which reduction to apply: - Click on a CFG production from the list of available reductions, on the left side of the main window. The reduction based on that production will be applied to the top of the stack. - Click on one of the stack elements. A popup window will appear, containing all available reductions. Select one, and it will be applied to the top of the stack. Note that reductions can only be applied to the top of the stack. Keyboard Shortcuts:: [Space]\t Perform the next shift or reduce operation [s]\t Perform a shift operation [r]\t Perform a reduction operation [Ctrl-z]\t Undo most recent operation [Delete]\t Reset the parser [g]\t Show/hide available production list [Ctrl-a]\t Toggle animations [h]\t Help [Ctrl-p]\t Print [q]\t Quit """ """ Possible future improvements: - button/window to change and/or select text. Just pop up a window with an entry, and let them modify the text; and then retokenize it? Maybe give a warning if it contains tokens whose types are not in the grammar. - button/window to change and/or select grammar. Select from several alternative grammars? Or actually change the grammar? 
If the later, then I'd want to define nltk.draw.cfg, which would be responsible for that. """ import nltk.compat import tkinter.font from tkinter import (IntVar, Listbox, Button, Frame, Label, Menu, Scrollbar, Tk) from nltk.tree import Tree from nltk.parse import SteppingShiftReduceParser from nltk.util import in_idle from nltk.draw.util import CanvasFrame, EntryDialog, ShowText, TextWidget from nltk.draw import CFGEditor, TreeSegmentWidget, tree_to_treesegment class ShiftReduceApp(object): """ A graphical tool for exploring the shift-reduce parser. The tool displays the parser's stack and the remaining text, and allows the user to control the parser's operation. In particular, the user can shift tokens onto the stack, and can perform reductions on the top elements of the stack. A "step" button simply steps through the parsing process, performing the operations that ``nltk.parse.ShiftReduceParser`` would use. """ def __init__(self, grammar, sent, trace=0): self._sent = sent self._parser = SteppingShiftReduceParser(grammar, trace) # Set up the main window. self._top = Tk() self._top.title('Shift Reduce Parser Application') # Animations. animating_lock is a lock to prevent the demo # from performing new operations while it's animating. self._animating_lock = 0 self._animate = IntVar(self._top) self._animate.set(10) # = medium # The user can hide the grammar. self._show_grammar = IntVar(self._top) self._show_grammar.set(1) # Initialize fonts. self._init_fonts(self._top) # Set up key bindings. self._init_bindings() # Create the basic frames. self._init_menubar(self._top) self._init_buttons(self._top) self._init_feedback(self._top) self._init_grammar(self._top) self._init_canvas(self._top) # A popup menu for reducing. self._reduce_menu = Menu(self._canvas, tearoff=0) # Reset the demo, and set the feedback frame to empty. self.reset() self._lastoper1['text'] = '' ######################################### ## Initialization Helpers ######################################### def _init_fonts(self, root): # See: self._sysfont = tkinter.font.Font(font=Button()["font"]) root.option_add("*Font", self._sysfont) # TWhat's our font size (default=same as sysfont) self._size = IntVar(root) self._size.set(self._sysfont.cget('size')) self._boldfont = tkinter.font.Font(family='helvetica', weight='bold', size=self._size.get()) self._font = tkinter.font.Font(family='helvetica', size=self._size.get()) def _init_grammar(self, parent): # Grammar view. self._prodframe = listframe = Frame(parent) self._prodframe.pack(fill='both', side='left', padx=2) self._prodlist_label = Label(self._prodframe, font=self._boldfont, text='Available Reductions') self._prodlist_label.pack() self._prodlist = Listbox(self._prodframe, selectmode='single', relief='groove', background='white', foreground='#909090', font=self._font, selectforeground='#004040', selectbackground='#c0f0c0') self._prodlist.pack(side='right', fill='both', expand=1) self._productions = list(self._parser.grammar().productions()) for production in self._productions: self._prodlist.insert('end', (' %s' % production)) self._prodlist.config(height=min(len(self._productions), 25)) # Add a scrollbar if there are more than 25 productions. if 1:#len(self._productions) > 25: listscroll = Scrollbar(self._prodframe, orient='vertical') self._prodlist.config(yscrollcommand = listscroll.set) listscroll.config(command=self._prodlist.yview) listscroll.pack(side='left', fill='y') # If they select a production, apply it. 
self._prodlist.bind('<>', self._prodlist_select) # When they hover over a production, highlight it. self._hover = -1 self._prodlist.bind('', self._highlight_hover) self._prodlist.bind('', self._clear_hover) def _init_bindings(self): # Quit self._top.bind('', self.destroy) self._top.bind('', self.destroy) self._top.bind('', self.destroy) self._top.bind('', self.destroy) # Ops (step, shift, reduce, undo) self._top.bind('', self.step) self._top.bind('', self.shift) self._top.bind('', self.shift) self._top.bind('', self.shift) self._top.bind('', self.reduce) self._top.bind('', self.reduce) self._top.bind('', self.reduce) self._top.bind('', self.reset) self._top.bind('', self.undo) self._top.bind('', self.undo) self._top.bind('', self.undo) self._top.bind('', self.undo) self._top.bind('', self.undo) # Misc self._top.bind('', self.postscript) self._top.bind('', self.help) self._top.bind('', self.help) self._top.bind('', self.edit_grammar) self._top.bind('', self.edit_sentence) # Animation speed control self._top.bind('-', lambda e,a=self._animate:a.set(20)) self._top.bind('=', lambda e,a=self._animate:a.set(10)) self._top.bind('+', lambda e,a=self._animate:a.set(4)) def _init_buttons(self, parent): # Set up the frames. self._buttonframe = buttonframe = Frame(parent) buttonframe.pack(fill='none', side='bottom') Button(buttonframe, text='Step', background='#90c0d0', foreground='black', command=self.step,).pack(side='left') Button(buttonframe, text='Shift', underline=0, background='#90f090', foreground='black', command=self.shift).pack(side='left') Button(buttonframe, text='Reduce', underline=0, background='#90f090', foreground='black', command=self.reduce).pack(side='left') Button(buttonframe, text='Undo', underline=0, background='#f0a0a0', foreground='black', command=self.undo).pack(side='left') def _init_menubar(self, parent): menubar = Menu(parent) filemenu = Menu(menubar, tearoff=0) filemenu.add_command(label='Reset Parser', underline=0, command=self.reset, accelerator='Del') filemenu.add_command(label='Print to Postscript', underline=0, command=self.postscript, accelerator='Ctrl-p') filemenu.add_command(label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x') menubar.add_cascade(label='File', underline=0, menu=filemenu) editmenu = Menu(menubar, tearoff=0) editmenu.add_command(label='Edit Grammar', underline=5, command=self.edit_grammar, accelerator='Ctrl-g') editmenu.add_command(label='Edit Text', underline=5, command=self.edit_sentence, accelerator='Ctrl-t') menubar.add_cascade(label='Edit', underline=0, menu=editmenu) rulemenu = Menu(menubar, tearoff=0) rulemenu.add_command(label='Step', underline=1, command=self.step, accelerator='Space') rulemenu.add_separator() rulemenu.add_command(label='Shift', underline=0, command=self.shift, accelerator='Ctrl-s') rulemenu.add_command(label='Reduce', underline=0, command=self.reduce, accelerator='Ctrl-r') rulemenu.add_separator() rulemenu.add_command(label='Undo', underline=0, command=self.undo, accelerator='Ctrl-u') menubar.add_cascade(label='Apply', underline=0, menu=rulemenu) viewmenu = Menu(menubar, tearoff=0) viewmenu.add_checkbutton(label="Show Grammar", underline=0, variable=self._show_grammar, command=self._toggle_grammar) viewmenu.add_separator() viewmenu.add_radiobutton(label='Tiny', variable=self._size, underline=0, value=10, command=self.resize) viewmenu.add_radiobutton(label='Small', variable=self._size, underline=0, value=12, command=self.resize) viewmenu.add_radiobutton(label='Medium', variable=self._size, underline=0, 
value=14, command=self.resize) viewmenu.add_radiobutton(label='Large', variable=self._size, underline=0, value=18, command=self.resize) viewmenu.add_radiobutton(label='Huge', variable=self._size, underline=0, value=24, command=self.resize) menubar.add_cascade(label='View', underline=0, menu=viewmenu) animatemenu = Menu(menubar, tearoff=0) animatemenu.add_radiobutton(label="No Animation", underline=0, variable=self._animate, value=0) animatemenu.add_radiobutton(label="Slow Animation", underline=0, variable=self._animate, value=20, accelerator='-') animatemenu.add_radiobutton(label="Normal Animation", underline=0, variable=self._animate, value=10, accelerator='=') animatemenu.add_radiobutton(label="Fast Animation", underline=0, variable=self._animate, value=4, accelerator='+') menubar.add_cascade(label="Animate", underline=1, menu=animatemenu) helpmenu = Menu(menubar, tearoff=0) helpmenu.add_command(label='About', underline=0, command=self.about) helpmenu.add_command(label='Instructions', underline=0, command=self.help, accelerator='F1') menubar.add_cascade(label='Help', underline=0, menu=helpmenu) parent.config(menu=menubar) def _init_feedback(self, parent): self._feedbackframe = feedbackframe = Frame(parent) feedbackframe.pack(fill='x', side='bottom', padx=3, pady=3) self._lastoper_label = Label(feedbackframe, text='Last Operation:', font=self._font) self._lastoper_label.pack(side='left') lastoperframe = Frame(feedbackframe, relief='sunken', border=1) lastoperframe.pack(fill='x', side='right', expand=1, padx=5) self._lastoper1 = Label(lastoperframe, foreground='#007070', background='#f0f0f0', font=self._font) self._lastoper2 = Label(lastoperframe, anchor='w', width=30, foreground='#004040', background='#f0f0f0', font=self._font) self._lastoper1.pack(side='left') self._lastoper2.pack(side='left', fill='x', expand=1) def _init_canvas(self, parent): self._cframe = CanvasFrame(parent, background='white', width=525, closeenough=10, border=2, relief='sunken') self._cframe.pack(expand=1, fill='both', side='top', pady=2) canvas = self._canvas = self._cframe.canvas() self._stackwidgets = [] self._rtextwidgets = [] self._titlebar = canvas.create_rectangle(0,0,0,0, fill='#c0f0f0', outline='black') self._exprline = canvas.create_line(0,0,0,0, dash='.') self._stacktop = canvas.create_line(0,0,0,0, fill='#408080') size = self._size.get()+4 self._stacklabel = TextWidget(canvas, 'Stack', color='#004040', font=self._boldfont) self._rtextlabel = TextWidget(canvas, 'Remaining Text', color='#004040', font=self._boldfont) self._cframe.add_widget(self._stacklabel) self._cframe.add_widget(self._rtextlabel) ######################################### ## Main draw procedure ######################################### def _redraw(self): scrollregion = self._canvas['scrollregion'].split() (cx1, cy1, cx2, cy2) = [int(c) for c in scrollregion] # Delete the old stack & rtext widgets. for stackwidget in self._stackwidgets: self._cframe.destroy_widget(stackwidget) self._stackwidgets = [] for rtextwidget in self._rtextwidgets: self._cframe.destroy_widget(rtextwidget) self._rtextwidgets = [] # Position the titlebar & exprline (x1, y1, x2, y2) = self._stacklabel.bbox() y = y2-y1+10 self._canvas.coords(self._titlebar, -5000, 0, 5000, y-4) self._canvas.coords(self._exprline, 0, y*2-10, 5000, y*2-10) # Position the titlebar labels.. (x1, y1, x2, y2) = self._stacklabel.bbox() self._stacklabel.move(5-x1, 3-y1) (x1, y1, x2, y2) = self._rtextlabel.bbox() self._rtextlabel.move(cx2-x2-5, 3-y1) # Draw the stack. 
stackx = 5 for tok in self._parser.stack(): if isinstance(tok, Tree): attribs = {'tree_color': '#4080a0', 'tree_width': 2, 'node_font': self._boldfont, 'node_color': '#006060', 'leaf_color': '#006060', 'leaf_font':self._font} widget = tree_to_treesegment(self._canvas, tok, **attribs) widget.label()['color'] = '#000000' else: widget = TextWidget(self._canvas, tok, color='#000000', font=self._font) widget.bind_click(self._popup_reduce) self._stackwidgets.append(widget) self._cframe.add_widget(widget, stackx, y) stackx = widget.bbox()[2] + 10 # Draw the remaining text. rtextwidth = 0 for tok in self._parser.remaining_text(): widget = TextWidget(self._canvas, tok, color='#000000', font=self._font) self._rtextwidgets.append(widget) self._cframe.add_widget(widget, rtextwidth, y) rtextwidth = widget.bbox()[2] + 4 # Allow enough room to shift the next token (for animations) if len(self._rtextwidgets) > 0: stackx += self._rtextwidgets[0].width() # Move the remaining text to the correct location (keep it # right-justified, when possible); and move the remaining text # label, if necessary. stackx = max(stackx, self._stacklabel.width()+25) rlabelwidth = self._rtextlabel.width()+10 if stackx >= cx2-max(rtextwidth, rlabelwidth): cx2 = stackx + max(rtextwidth, rlabelwidth) for rtextwidget in self._rtextwidgets: rtextwidget.move(4+cx2-rtextwidth, 0) self._rtextlabel.move(cx2-self._rtextlabel.bbox()[2]-5, 0) midx = (stackx + cx2-max(rtextwidth, rlabelwidth))/2 self._canvas.coords(self._stacktop, midx, 0, midx, 5000) (x1, y1, x2, y2) = self._stacklabel.bbox() # Set up binding to allow them to shift a token by dragging it. if len(self._rtextwidgets) > 0: def drag_shift(widget, midx=midx, self=self): if widget.bbox()[0] < midx: self.shift() else: self._redraw() self._rtextwidgets[0].bind_drag(drag_shift) self._rtextwidgets[0].bind_click(self.shift) # Draw the stack top. self._highlight_productions() def _draw_stack_top(self, widget): # hack.. midx = widget.bbox()[2]+50 self._canvas.coords(self._stacktop, midx, 0, midx, 5000) def _highlight_productions(self): # Highlight the productions that can be reduced. 
self._prodlist.selection_clear(0, 'end') for prod in self._parser.reducible_productions(): index = self._productions.index(prod) self._prodlist.selection_set(index) ######################################### ## Button Callbacks ######################################### def destroy(self, *e): if self._top is None: return self._top.destroy() self._top = None def reset(self, *e): self._parser.initialize(self._sent) self._lastoper1['text'] = 'Reset App' self._lastoper2['text'] = '' self._redraw() def step(self, *e): if self.reduce(): return True elif self.shift(): return True else: if list(self._parser.parses()): self._lastoper1['text'] = 'Finished:' self._lastoper2['text'] = 'Success' else: self._lastoper1['text'] = 'Finished:' self._lastoper2['text'] = 'Failure' def shift(self, *e): if self._animating_lock: return if self._parser.shift(): tok = self._parser.stack()[-1] self._lastoper1['text'] = 'Shift:' self._lastoper2['text'] = '%r' % tok if self._animate.get(): self._animate_shift() else: self._redraw() return True return False def reduce(self, *e): if self._animating_lock: return production = self._parser.reduce() if production: self._lastoper1['text'] = 'Reduce:' self._lastoper2['text'] = '%s' % production if self._animate.get(): self._animate_reduce() else: self._redraw() return production def undo(self, *e): if self._animating_lock: return if self._parser.undo(): self._redraw() def postscript(self, *e): self._cframe.print_to_file() def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this demo is created from a non-interactive program (e.g. from a secript); otherwise, the demo will close as soon as the script completes. """ if in_idle(): return self._top.mainloop(*args, **kwargs) ######################################### ## Menubar callbacks ######################################### def resize(self, size=None): if size is not None: self._size.set(size) size = self._size.get() self._font.configure(size=-(abs(size))) self._boldfont.configure(size=-(abs(size))) self._sysfont.configure(size=-(abs(size))) #self._stacklabel['font'] = ('helvetica', -size-4, 'bold') #self._rtextlabel['font'] = ('helvetica', -size-4, 'bold') #self._lastoper_label['font'] = ('helvetica', -size) #self._lastoper1['font'] = ('helvetica', -size) #self._lastoper2['font'] = ('helvetica', -size) #self._prodlist['font'] = ('helvetica', -size) #self._prodlist_label['font'] = ('helvetica', -size-2, 'bold') self._redraw() def help(self, *e): # The default font's not very legible; try using 'fixed' instead. try: ShowText(self._top, 'Help: Shift-Reduce Parser Application', (__doc__ or '').strip(), width=75, font='fixed') except: ShowText(self._top, 'Help: Shift-Reduce Parser Application', (__doc__ or '').strip(), width=75) def about(self, *e): ABOUT = ("NLTK Shift-Reduce Parser Application\n"+ "Written by Edward Loper") TITLE = 'About: Shift-Reduce Parser Application' try: from tkinter.messagebox import Message Message(message=ABOUT, title=TITLE).show() except: ShowText(self._top, TITLE, ABOUT) def edit_grammar(self, *e): CFGEditor(self._top, self._parser.grammar(), self.set_grammar) def set_grammar(self, grammar): self._parser.set_grammar(grammar) self._productions = list(grammar.productions()) self._prodlist.delete(0, 'end') for production in self._productions: self._prodlist.insert('end', (' %s' % production)) def edit_sentence(self, *e): sentence = " ".join(self._sent) title = 'Edit Text' instr = 'Enter a new sentence to parse.' 
EntryDialog(self._top, sentence, instr, self.set_sentence, title) def set_sentence(self, sent): self._sent = sent.split() #[XX] use tagged? self.reset() ######################################### ## Reduce Production Selection ######################################### def _toggle_grammar(self, *e): if self._show_grammar.get(): self._prodframe.pack(fill='both', side='left', padx=2, after=self._feedbackframe) self._lastoper1['text'] = 'Show Grammar' else: self._prodframe.pack_forget() self._lastoper1['text'] = 'Hide Grammar' self._lastoper2['text'] = '' def _prodlist_select(self, event): selection = self._prodlist.curselection() if len(selection) != 1: return index = int(selection[0]) production = self._parser.reduce(self._productions[index]) if production: self._lastoper1['text'] = 'Reduce:' self._lastoper2['text'] = '%s' % production if self._animate.get(): self._animate_reduce() else: self._redraw() else: # Reset the production selections. self._prodlist.selection_clear(0, 'end') for prod in self._parser.reducible_productions(): index = self._productions.index(prod) self._prodlist.selection_set(index) def _popup_reduce(self, widget): # Remove old commands. productions = self._parser.reducible_productions() if len(productions) == 0: return self._reduce_menu.delete(0, 'end') for production in productions: self._reduce_menu.add_command(label=str(production), command=self.reduce) self._reduce_menu.post(self._canvas.winfo_pointerx(), self._canvas.winfo_pointery()) ######################################### ## Animations ######################################### def _animate_shift(self): # What widget are we shifting? widget = self._rtextwidgets[0] # Where are we shifting from & to? right = widget.bbox()[0] if len(self._stackwidgets) == 0: left = 5 else: left = self._stackwidgets[-1].bbox()[2]+10 # Start animating. dt = self._animate.get() dx = (left-right)*1.0/dt self._animate_shift_frame(dt, widget, dx) def _animate_shift_frame(self, frame, widget, dx): if frame > 0: self._animating_lock = 1 widget.move(dx, 0) self._top.after(10, self._animate_shift_frame, frame-1, widget, dx) else: # but: stacktop?? # Shift the widget to the stack. del self._rtextwidgets[0] self._stackwidgets.append(widget) self._animating_lock = 0 # Display the available productions. self._draw_stack_top(widget) self._highlight_productions() def _animate_reduce(self): # What widgets are we shifting? numwidgets = len(self._parser.stack()[-1]) # number of children widgets = self._stackwidgets[-numwidgets:] # How far are we moving? if isinstance(widgets[0], TreeSegmentWidget): ydist = 15 + widgets[0].label().height() else: ydist = 15 + widgets[0].height() # Start animating. 
dt = self._animate.get() dy = ydist*2.0/dt self._animate_reduce_frame(dt/2, widgets, dy) def _animate_reduce_frame(self, frame, widgets, dy): if frame > 0: self._animating_lock = 1 for widget in widgets: widget.move(0, dy) self._top.after(10, self._animate_reduce_frame, frame-1, widgets, dy) else: del self._stackwidgets[-len(widgets):] for widget in widgets: self._cframe.remove_widget(widget) tok = self._parser.stack()[-1] if not isinstance(tok, Tree): raise ValueError() label = TextWidget(self._canvas, str(tok.label()), color='#006060', font=self._boldfont) widget = TreeSegmentWidget(self._canvas, label, widgets, width=2) (x1, y1, x2, y2) = self._stacklabel.bbox() y = y2-y1+10 if not self._stackwidgets: x = 5 else: x = self._stackwidgets[-1].bbox()[2] + 10 self._cframe.add_widget(widget, x, y) self._stackwidgets.append(widget) # Display the available productions. self._draw_stack_top(widget) self._highlight_productions() # # Delete the old widgets.. # del self._stackwidgets[-len(widgets):] # for widget in widgets: # self._cframe.destroy_widget(widget) # # # Make a new one. # tok = self._parser.stack()[-1] # if isinstance(tok, Tree): # attribs = {'tree_color': '#4080a0', 'tree_width': 2, # 'node_font': bold, 'node_color': '#006060', # 'leaf_color': '#006060', 'leaf_font':self._font} # widget = tree_to_treesegment(self._canvas, tok.type(), # **attribs) # widget.node()['color'] = '#000000' # else: # widget = TextWidget(self._canvas, tok.type(), # color='#000000', font=self._font) # widget.bind_click(self._popup_reduce) # (x1, y1, x2, y2) = self._stacklabel.bbox() # y = y2-y1+10 # if not self._stackwidgets: x = 5 # else: x = self._stackwidgets[-1].bbox()[2] + 10 # self._cframe.add_widget(widget, x, y) # self._stackwidgets.append(widget) #self._redraw() self._animating_lock = 0 ######################################### ## Hovering. ######################################### def _highlight_hover(self, event): # What production are we hovering over? index = self._prodlist.nearest(event.y) if self._hover == index: return # Clear any previous hover highlighting. self._clear_hover() # If the production corresponds to an available reduction, # highlight the stack. selection = [int(s) for s in self._prodlist.curselection()] if index in selection: rhslen = len(self._productions[index].rhs()) for stackwidget in self._stackwidgets[-rhslen:]: if isinstance(stackwidget, TreeSegmentWidget): stackwidget.label()['color'] = '#00a000' else: stackwidget['color'] = '#00a000' # Remember what production we're hovering over. self._hover = index def _clear_hover(self, *event): # Clear any previous hover highlighting. if self._hover == -1: return self._hover = -1 for stackwidget in self._stackwidgets: if isinstance(stackwidget, TreeSegmentWidget): stackwidget.label()['color'] = 'black' else: stackwidget['color'] = 'black' def app(): """ Create a shift reduce parser app, using a simple grammar and text. 
""" from nltk.grammar import Nonterminal, Production, CFG nonterminals = 'S VP NP PP P N Name V Det' (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s) for s in nonterminals.split()] productions = ( # Syntactic Productions Production(S, [NP, VP]), Production(NP, [Det, N]), Production(NP, [NP, PP]), Production(VP, [VP, PP]), Production(VP, [V, NP, PP]), Production(VP, [V, NP]), Production(PP, [P, NP]), # Lexical Productions Production(NP, ['I']), Production(Det, ['the']), Production(Det, ['a']), Production(N, ['man']), Production(V, ['saw']), Production(P, ['in']), Production(P, ['with']), Production(N, ['park']), Production(N, ['dog']), Production(N, ['statue']), Production(Det, ['my']), ) grammar = CFG(S, productions) # tokenize the sentence sent = 'my dog saw a man in the park with a statue'.split() ShiftReduceApp(grammar, sent).mainloop() if __name__ == '__main__': app() __all__ = ['app'] nltk-3.1/nltk/app/wordfreq_app.py0000644000076500000240000000161312607224144016652 0ustar sbstaff00000000000000# Natural Language Toolkit: Wordfreq Application # # Copyright (C) 2001-2015 NLTK Project # Author: Sumukh Ghodke # URL: # For license information, see LICENSE.TXT from matplotlib import pylab from nltk.text import Text from nltk.corpus import gutenberg def plot_word_freq_dist(text): fd = text.vocab() samples = [item for item, _ in fd.most_common(50)] values = [fd[sample] for sample in samples] values = [sum(values[:i+1]) * 100.0/fd.N() for i in range(len(values))] pylab.title(text.name) pylab.xlabel("Samples") pylab.ylabel("Cumulative Percentage") pylab.plot(values) pylab.xticks(range(len(samples)), [str(s) for s in samples], rotation=90) pylab.show() def app(): t1 = Text(gutenberg.words('melville-moby_dick.txt')) plot_word_freq_dist(t1) if __name__ == '__main__': app() __all__ = ['app'] nltk-3.1/nltk/app/wordnet_app.py0000644000076500000240000010317612607224144016512 0ustar sbstaff00000000000000# Natural Language Toolkit: WordNet Browser Application # # Copyright (C) 2001-2015 NLTK Project # Author: Jussi Salmela # Paul Bone # URL: # For license information, see LICENSE.TXT """ A WordNet Browser application which launches the default browser (if it is not already running) and opens a new tab with a connection to http://localhost:port/ . It also starts an HTTP server on the specified port and begins serving browser requests. The default port is 8000. (For command-line help, run "python wordnet -h") This application requires that the user's web browser supports Javascript. BrowServer is a server for browsing the NLTK Wordnet database It first launches a browser client to be used for browsing and then starts serving the requests of that and maybe other clients Usage:: browserver.py -h browserver.py [-s] [-p ] Options:: -h or --help Display this help message. -l or --log-file Logs messages to the given file, If this option is not specified messages are silently dropped. -p or --port Run the web server on this TCP port, defaults to 8000. -s or --server-mode Do not start a web browser, and do not allow a user to shotdown the server through the web interface. """ # TODO: throughout this package variable names and docstrings need # modifying to be compliant with NLTK's coding standards. Tests also # need to be develop to ensure this continues to work in the face of # changes to other NLTK packages. from __future__ import print_function # Allow this program to run inside the NLTK source tree. 
from sys import path import os import sys from sys import argv from collections import defaultdict import webbrowser import datetime import re import threading import time import getopt import base64 import pickle import copy from nltk import compat from nltk.corpus import wordnet as wn from nltk.corpus.reader.wordnet import Synset, Lemma if compat.PY3: from http.server import HTTPServer, BaseHTTPRequestHandler else: from BaseHTTPServer import HTTPServer, BaseHTTPRequestHandler # now included in local file # from util import html_header, html_trailer, \ # get_static_index_page, get_static_page_by_path, \ # page_from_word, page_from_href firstClient = True # True if we're not also running a web browser. The value f server_mode # gets set by demo(). server_mode = None # If set this is a file object for writting log messages. logfile = None class MyServerHandler(BaseHTTPRequestHandler): def do_HEAD(self): self.send_head() def do_GET(self): global firstClient sp = self.path[1:] if compat.unquote_plus(sp) == 'SHUTDOWN THE SERVER': if server_mode: page = "Server must be killed with SIGTERM." type = "text/plain" else: print('Server shutting down!') os._exit(0) elif sp == '': # First request. type = 'text/html' if not server_mode and firstClient: firstClient = False page = get_static_index_page(True) else: page = get_static_index_page(False) word = 'green' elif sp.endswith('.html'): # Trying to fetch a HTML file TODO: type = 'text/html' usp = compat.unquote_plus(sp) if usp == 'NLTK Wordnet Browser Database Info.html': word = '* Database Info *' if os.path.isfile(usp): with open(usp, 'r') as infile: page = infile.read() else: page = (html_header % word) + \ '

<p>The database info file:'\
                           '<p><b>' + usp + '</b>' + \
                           '<p>was not found. Run this:' + \
                           '<p><b>python dbinfo_html.py</b>' + \
                           '<p>
to produce it.' + html_trailer else: # Handle files here. word = sp page = get_static_page_by_path(usp) elif sp.startswith("search"): # This doesn't seem to work with MWEs. type = 'text/html' parts = (sp.split("?")[1]).split("&") word = [p.split("=")[1].replace("+", " ") for p in parts if p.startswith("nextWord")][0] page, word = page_from_word(word) elif sp.startswith("lookup_"): # TODO add a variation of this that takes a non ecoded word or MWE. type = 'text/html' sp = sp[len("lookup_"):] page, word = page_from_href(sp) elif sp == "start_page": # if this is the first request we should display help # information, and possibly set a default word. type = 'text/html' page, word = page_from_word("wordnet") else: type = 'text/plain' page = "Could not parse request: '%s'" % sp # Send result. self.send_head(type) self.wfile.write(page.encode('utf8')) def send_head(self, type=None): self.send_response(200) self.send_header('Content-type', type) self.end_headers() def log_message(self, format, *args): global logfile if logfile: logfile.write( "%s - - [%s] %s\n" % (self.address_string(), self.log_date_time_string(), format%args)) def get_unique_counter_from_url(sp): """ Extract the unique counter from the URL if it has one. Otherwise return null. """ pos = sp.rfind('%23') if pos != -1: return int(sp[(pos + 3):]) else: return None def wnb(port=8000, runBrowser=True, logfilename=None): """ Run NLTK Wordnet Browser Server. :param port: The port number for the server to listen on, defaults to 8000 :type port: int :param runBrowser: True to start a web browser and point it at the web server. :type runBrowser: bool """ # The webbrowser module is unpredictable, typically it blocks if it uses # a console web browser, and doesn't block if it uses a GUI webbrowser, # so we need to force it to have a clear correct behaviour. # # Normally the server should run for as long as the user wants. they # should idealy be able to control this from the UI by closing the # window or tab. Second best would be clicking a button to say # 'Shutdown' that first shutsdown the server and closes the window or # tab, or exits the text-mode browser. Both of these are unfreasable. # # The next best alternative is to start the server, have it close when # it receives SIGTERM (default), and run the browser as well. The user # may have to shutdown both programs. # # Since webbrowser may block, and the webserver will block, we must run # them in separate threads. # global server_mode, logfile server_mode = not runBrowser # Setup logging. if logfilename: try: logfile = open(logfilename, "a", 1) # 1 means 'line buffering' except IOError as e: sys.stderr.write("Couldn't open %s for writing: %s", logfilename, e) sys.exit(1) else: logfile = None # Compute URL and start web browser url = 'http://localhost:' + str(port) server_ready = None browser_thread = None if runBrowser: server_ready = threading.Event() browser_thread = startBrowser(url, server_ready) # Start the server. server = HTTPServer(('', port), MyServerHandler) if logfile: logfile.write( 'NLTK Wordnet browser server running serving: %s\n' % url) if runBrowser: server_ready.set() try: server.serve_forever() except KeyboardInterrupt: pass if runBrowser: browser_thread.join() if logfile: logfile.close() def startBrowser(url, server_ready): def run(): server_ready.wait() time.sleep(1) # Wait a little bit more, there's still the chance of # a race condition. 
webbrowser.open(url, new = 2, autoraise = 1) t = threading.Thread(target=run) t.start() return t ##################################################################### # Utilities ##################################################################### """ WordNet Browser Utilities. This provides a backend to both wxbrowse and browserver.py. """ ################################################################################ # # Main logic for wordnet browser. # # This is wrapped inside a function since wn is only available if the # WordNet corpus is installed. def _pos_tuples(): return [ (wn.NOUN,'N','noun'), (wn.VERB,'V','verb'), (wn.ADJ,'J','adj'), (wn.ADV,'R','adv')] def _pos_match(pos_tuple): """ This function returns the complete pos tuple for the partial pos tuple given to it. It attempts to match it against the first non-null component of the given pos tuple. """ if pos_tuple[0] == 's': pos_tuple = ('a', pos_tuple[1], pos_tuple[2]) for n,x in enumerate(pos_tuple): if x is not None: break for pt in _pos_tuples(): if pt[n] == pos_tuple[n]: return pt return None HYPONYM = 0 HYPERNYM = 1 CLASS_REGIONAL = 2 PART_HOLONYM = 3 PART_MERONYM = 4 ATTRIBUTE = 5 SUBSTANCE_HOLONYM = 6 SUBSTANCE_MERONYM = 7 MEMBER_HOLONYM = 8 MEMBER_MERONYM = 9 VERB_GROUP = 10 INSTANCE_HYPONYM = 12 INSTANCE_HYPERNYM = 13 CAUSE = 14 ALSO_SEE = 15 SIMILAR = 16 ENTAILMENT = 17 ANTONYM = 18 FRAMES = 19 PERTAINYM = 20 CLASS_CATEGORY = 21 CLASS_USAGE = 22 CLASS_REGIONAL = 23 CLASS_USAGE = 24 CLASS_CATEGORY = 11 DERIVATIONALLY_RELATED_FORM = 25 INDIRECT_HYPERNYMS = 26 def lemma_property(word, synset, func): def flattern(l): if l == []: return [] else: return l[0] + flattern(l[1:]) return flattern([func(l) for l in synset.lemmas if l.name == word]) def rebuild_tree(orig_tree): node = orig_tree[0] children = orig_tree[1:] return (node, [rebuild_tree(t) for t in children]) def get_relations_data(word, synset): """ Get synset relations data for a synset. Note that this doesn't yet support things such as full hyponym vs direct hyponym. 
""" if synset.pos() == wn.NOUN: return ((HYPONYM, 'Hyponyms', synset.hyponyms()), (INSTANCE_HYPONYM , 'Instance hyponyms', synset.instance_hyponyms()), (HYPERNYM, 'Direct hypernyms', synset.hypernyms()), (INDIRECT_HYPERNYMS, 'Indirect hypernyms', rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1]), # hypernyms', 'Sister terms', (INSTANCE_HYPERNYM , 'Instance hypernyms', synset.instance_hypernyms()), # (CLASS_REGIONAL, ['domain term region'], ), (PART_HOLONYM, 'Part holonyms', synset.part_holonyms()), (PART_MERONYM, 'Part meronyms', synset.part_meronyms()), (SUBSTANCE_HOLONYM, 'Substance holonyms', synset.substance_holonyms()), (SUBSTANCE_MERONYM, 'Substance meronyms', synset.substance_meronyms()), (MEMBER_HOLONYM, 'Member holonyms', synset.member_holonyms()), (MEMBER_MERONYM, 'Member meronyms', synset.member_meronyms()), (ATTRIBUTE, 'Attributes', synset.attributes()), (ANTONYM, "Antonyms", lemma_property(word, synset, lambda l: l.antonyms())), (DERIVATIONALLY_RELATED_FORM, "Derivationally related form", lemma_property(word, synset, lambda l: l.derivationally_related_forms()))) elif synset.pos() == wn.VERB: return ((ANTONYM, 'Antonym', lemma_property(word, synset, lambda l: l.antonyms())), (HYPONYM, 'Hyponym', synset.hyponyms()), (HYPERNYM, 'Direct hypernyms', synset.hypernyms()), (INDIRECT_HYPERNYMS, 'Indirect hypernyms', rebuild_tree(synset.tree(lambda x: x.hypernyms()))[1]), (ENTAILMENT, 'Entailments', synset.entailments()), (CAUSE, 'Causes', synset.causes()), (ALSO_SEE, 'Also see', synset.also_sees()), (VERB_GROUP, 'Verb Groups', synset.verb_groups()), (DERIVATIONALLY_RELATED_FORM, "Derivationally related form", lemma_property(word, synset, lambda l: l.derivationally_related_forms()))) elif synset.pos() == wn.ADJ or synset.pos == wn.ADJ_SAT: return ((ANTONYM, 'Antonym', lemma_property(word, synset, lambda l: l.antonyms())), (SIMILAR, 'Similar to', synset.similar_tos()), # Participle of verb - not supported by corpus (PERTAINYM, 'Pertainyms', lemma_property(word, synset, lambda l: l.pertainyms())), (ATTRIBUTE, 'Attributes', synset.attributes()), (ALSO_SEE, 'Also see', synset.also_sees())) elif synset.pos() == wn.ADV: # This is weird. adverbs such as 'quick' and 'fast' don't seem # to have antonyms returned by the corpus.a return ((ANTONYM, 'Antonym', lemma_property(word, synset, lambda l: l.antonyms())),) # Derived from adjective - not supported by corpus else: raise TypeError("Unhandles synset POS type: " + str(synset.pos())) html_header = ''' NLTK Wordnet Browser display of: %s ''' html_trailer = ''' ''' explanation = '''

Search Help

  • The display below the line is an example of the output the browser shows you when you enter a search word. The search word was green.
  • The search result shows for different parts of speech the synsets i.e. different meanings for the word.
  • All underlined texts are hypertext links. There are two types of links: word links and others. Clicking a word link carries out a search for the word in the Wordnet database.
  • Clicking a link of the other type opens a display section of data attached to that link. Clicking that link a second time closes the section again.
  • Clicking S: opens a section showing the relations for that synset.
  • Clicking on a relation name opens a section that displays the associated synsets.
  • Type a search word in the Word field and start the search by the Enter/Return key or click the Search button.

''' # HTML oriented functions def _bold(txt): return '%s' % txt def _center(txt): return '
%s
' % txt def _hlev(n,txt): return '%s' % (n,txt,n) def _italic(txt): return '%s' % txt def _li(txt): return '
  • %s
  • ' % txt def pg(word, body): ''' Return a HTML page of NLTK Browser format constructed from the word and body :param word: The word that the body corresponds to :type word: str :param body: The HTML body corresponding to the word :type body: str :return: a HTML page for the word-body combination :rtype: str ''' return (html_header % word) + body + html_trailer def _ul(txt): return '
      ' + txt + '
    ' def _abbc(txt): """ abbc = asterisks, breaks, bold, center """ return _center(_bold('
    '*10 + '*'*10 + ' ' + txt + ' ' + '*'*10)) full_hyponym_cont_text = \ _ul(_li(_italic('(has full hyponym continuation)'))) + '\n' def _get_synset(synset_key): """ The synset key is the unique name of the synset, this can be retrived via synset.name() """ return wn.synset(synset_key) def _collect_one_synset(word, synset, synset_relations): ''' Returns the HTML string for one synset or word :param word: the current word :type word: str :param synset: a synset :type synset: synset :param synset_relations: information about which synset relations to display. :type synset_relations: dict(synset_key, set(relation_id)) :return: The HTML string built for this synset :rtype: str ''' if isinstance(synset, tuple): # It's a word raise NotImplementedError("word not supported by _collect_one_synset") typ = 'S' pos_tuple = _pos_match((synset.pos(), None, None)) assert pos_tuple is not None, "pos_tuple is null: synset.pos(): %s" % synset.pos() descr = pos_tuple[2] ref = copy.deepcopy(Reference(word, synset_relations)) ref.toggle_synset(synset) synset_label = typ + ";" if synset.name() in synset_relations: synset_label = _bold(synset_label) s = '
  • %s (%s) ' % (make_lookup_link(ref, synset_label), descr) def format_lemma(w): w = w.replace('_', ' ') if w.lower() == word: return _bold(w) else: ref = Reference(w) return make_lookup_link(ref, w) s += ', '.join(format_lemma(l.name()) for l in synset.lemmas()) gl = " (%s) %s " % \ (synset.definition(), "; ".join("\"%s\"" % e for e in synset.examples())) return s + gl + _synset_relations(word, synset, synset_relations) + '
  • \n' def _collect_all_synsets(word, pos, synset_relations=dict()): """ Return a HTML unordered list of synsets for the given word and part of speech. """ return '
      %s\n
    \n' % \ ''.join((_collect_one_synset(word, synset, synset_relations) for synset in wn.synsets(word, pos))) def _synset_relations(word, synset, synset_relations): ''' Builds the HTML string for the relations of a synset :param word: The current word :type word: str :param synset: The synset for which we're building the relations. :type synset: Synset :param synset_relations: synset keys and relation types for which to display relations. :type synset_relations: dict(synset_key, set(relation_type)) :return: The HTML for a synset's relations :rtype: str ''' if not synset.name() in synset_relations: return "" ref = Reference(word, synset_relations) def relation_html(r): if isinstance(r, Synset): return make_lookup_link(Reference(r.lemma_names()[0]), r.lemma_names()[0]) elif isinstance(r, Lemma): return relation_html(r.synset()) elif isinstance(r, tuple): # It's probably a tuple containing a Synset and a list of # similar tuples. This forms a tree of synsets. return "%s\n
      %s
    \n" % \ (relation_html(r[0]), ''.join('
  • %s
  • \n' % relation_html(sr) for sr in r[1])) else: raise TypeError("r must be a synset, lemma or list, it was: type(r) = %s, r = %s" % (type(r), r)) def make_synset_html(db_name, disp_name, rels): synset_html = '%s\n' % \ make_lookup_link( copy.deepcopy(ref).toggle_synset_relation(synset, db_name).encode(), disp_name) if db_name in ref.synset_relations[synset.name()]: synset_html += '
      %s
    \n' % \ ''.join("
  • %s
  • \n" % relation_html(r) for r in rels) return synset_html html = '
      ' + \ '\n'.join(("
    • %s
    • " % make_synset_html(*rel_data) for rel_data in get_relations_data(word, synset) if rel_data[2] != [])) + \ '
    ' return html class Reference(object): """ A reference to a page that may be generated by page_word """ def __init__(self, word, synset_relations=dict()): """ Build a reference to a new page. word is the word or words (separated by commas) for which to search for synsets of synset_relations is a dictionary of synset keys to sets of synset relation identifaiers to unfold a list of synset relations for. """ self.word = word self.synset_relations = synset_relations def encode(self): """ Encode this reference into a string to be used in a URL. """ # This uses a tuple rather than an object since the python # pickle representation is much smaller and there is no need # to represent the complete object. string = pickle.dumps((self.word, self.synset_relations), -1) return base64.urlsafe_b64encode(string).decode() @staticmethod def decode(string): """ Decode a reference encoded with Reference.encode """ string = base64.urlsafe_b64decode(string.encode()) word, synset_relations = pickle.loads(string) return Reference(word, synset_relations) def toggle_synset_relation(self, synset, relation): """ Toggle the display of the relations for the given synset and relation type. This function will throw a KeyError if the synset is currently not being displayed. """ if relation in self.synset_relations[synset.name()]: self.synset_relations[synset.name()].remove(relation) else: self.synset_relations[synset.name()].add(relation) return self def toggle_synset(self, synset): """ Toggle displaying of the relation types for the given synset """ if synset.name() in self.synset_relations: del self.synset_relations[synset.name()] else: self.synset_relations[synset.name()] = set() return self def make_lookup_link(ref, label): return '%s' % (ref.encode(), label) def page_from_word(word): """ Return a HTML page for the given word. :param word: The currently active word :type word: str :return: A tuple (page,word), where page is the new current HTML page to be sent to the browser and word is the new current word :rtype: A tuple (str,str) """ return page_from_reference(Reference(word)) def page_from_href(href): ''' Returns a tuple of the HTML page built and the new current word :param href: The hypertext reference to be solved :type href: str :return: A tuple (page,word), where page is the new current HTML page to be sent to the browser and word is the new current word :rtype: A tuple (str,str) ''' return page_from_reference(Reference.decode(href)) def page_from_reference(href): ''' Returns a tuple of the HTML page built and the new current word :param href: The hypertext reference to be solved :type href: str :return: A tuple (page,word), where page is the new current HTML page to be sent to the browser and word is the new current word :rtype: A tuple (str,str) ''' word = href.word pos_forms = defaultdict(list) words = word.split(',') words = [w for w in [w.strip().lower().replace(' ', '_') for w in words] if w != ""] if len(words) == 0: # No words were found. return "", "Please specify a word to search for." # This looks up multiple words at once. This is probably not # necessary and may lead to problems. for w in words: for pos in [wn.NOUN, wn.VERB, wn.ADJ, wn.ADV]: form = wn.morphy(w, pos) if form and form not in pos_forms[pos]: pos_forms[pos].append(form) body = '' for pos,pos_str,name in _pos_tuples(): if pos in pos_forms: body += _hlev(3, name) + '\n' for w in pos_forms[pos]: # Not all words of exc files are in the database, skip # to the next word if a KeyError is raised. 
try: body += _collect_all_synsets(w, pos, href.synset_relations) except KeyError: pass if not body: body = "The word or words '%s' where not found in the dictonary." % word return body, word ##################################################################### # Static pages ##################################################################### def get_static_page_by_path(path): """ Return a static HTML page from the path given. """ if path == "index_2.html": return get_static_index_page(False) elif path == "index.html": return get_static_index_page(True) elif path == "NLTK Wordnet Browser Database Info.html": return "Display of Wordnet Database Statistics is not supported" elif path == "upper_2.html": return get_static_upper_page(False) elif path == "upper.html": return get_static_upper_page(True) elif path == "web_help.html": return get_static_web_help_page() elif path == "wx_help.html": return get_static_wx_help_page() else: return "Internal error: Path for static page '%s' is unknown" % path def get_static_web_help_page(): """ Return the static web help page. """ return \ """ NLTK Wordnet Browser display of: * Help *

    NLTK Wordnet Browser Help

    The NLTK Wordnet Browser is a tool for browsing the Wordnet database. It behaves much like the Wordnet project's own web interface, with the difference that the NLTK Wordnet Browser uses a local copy of the Wordnet database.

    You are using the Javascript client part of the NLTK Wordnet BrowServer. We assume that your browser has tabbed browsing enabled.

    For background information on Wordnet, see the Wordnet project home page: http://wordnet.princeton.edu/. For more information on the NLTK project, see the project home page: http://nltk.sourceforge.net/. To get an idea of what the Wordnet version used by this browser includes, choose Show Database Info from the View submenu.

    Word search

    The word to be searched for is typed into the New Word field, and the search is started by pressing Enter or by clicking the Search button. There is no uppercase/lowercase distinction: the search word is converted to lowercase before the search.

    In addition, the word does not have to be in base form. The browser tries to find the possible base form(s) by applying certain morphological substitutions. Typing fLIeS, as an obscure example, gives one this. Click the previous link to see what this kind of search looks like, and then come back to this page using the Alt+LeftArrow key combination.
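
    For illustration, the following minimal sketch (ours, not part of the browser code) shows how such base forms can be recovered with NLTK's wn.morphy, the same lookup the browser performs for each part of speech; the helper name base_forms is hypothetical:

        from nltk.corpus import wordnet as wn

        def base_forms(word):
            # Lowercase the input, as the browser does, then try each part
            # of speech; wn.morphy returns None when no base form is found.
            word = word.strip().lower().replace(' ', '_')
            forms = {}
            for pos in (wn.NOUN, wn.VERB, wn.ADJ, wn.ADV):
                form = wn.morphy(word, pos)
                if form is not None:
                    forms[pos] = form
            return forms

        print(base_forms('fLIeS'))   # expected to include {'n': 'fly', 'v': 'fly'}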

    The result of a search is a display of one or more synsets for every part of speech in which a form of the search word was found to occur. A synset is a set of words having the same sense or meaning. Each word in a synset that is underlined is a hyperlink which can be clicked to trigger an automatic search for that word.

    Every synset has a hyperlink S: at the start of its display line. Clicking that symbol shows you the name of every relation that this synset is part of. Every relation name is a hyperlink that opens up a display for that relation. Clicking it another time closes the display again. Clicking another relation name on a line that has an opened relation closes the open relation and opens the clicked relation.
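
    To get a feel for what lies behind the S: link, the short sketch below (illustrative only, not part of the browser) prints a few of the synset-level relations for the first noun synset of a word:

        from nltk.corpus import wordnet as wn

        # First noun synset of 'dog' and a few of the relations it takes part in.
        synset = wn.synsets('dog', pos=wn.NOUN)[0]
        print(synset.name(), '-', synset.definition())
        print('hypernyms:', synset.hypernyms())
        print('hyponyms:', synset.hyponyms()[:3])
        print('member holonyms:', synset.member_holonyms())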

    It is also possible to search for two or more words or collocations at the same time by separating them with a comma, like this: cheer up,clear up. Click the previous link to see what this kind of search looks like, and then come back to this page using the Alt+LeftArrow key combination. As you can see, the search result lists the synsets found in the same order as the forms were given in the search field.
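
    A rough sketch of the normalization applied to such a comma-separated query before lookup, mirroring what page_from_reference does (the helper name split_query is ours):

        def split_query(query):
            # Split on commas, trim whitespace, lowercase, and join
            # multi-word collocations with underscores, as Wordnet stores
            # them (e.g. 'cheer up' -> 'cheer_up').
            words = [w.strip().lower().replace(' ', '_')
                     for w in query.split(',')]
            return [w for w in words if w != '']

        print(split_query('cheer up,clear up'))   # ['cheer_up', 'clear_up']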

    There are also word-level (lexical) relations recorded in the Wordnet database. Opening this kind of relation displays lines with a hyperlink W: at their beginning. Clicking this link shows more information on the word in question, as sketched below.
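
    Lexical relations of this kind can also be inspected directly through the corpus reader; the sketch below (not part of the browser; the helper name lemma_antonyms is ours) prints the antonym pairs recorded for the lemmas of a word, one example of a word-level relation:

        from nltk.corpus import wordnet as wn

        def lemma_antonyms(word):
            # Antonymy is stored on lemmas (individual words), not on
            # synsets, which is why it is listed as a word-level relation.
            pairs = []
            for synset in wn.synsets(word):
                for lemma in synset.lemmas():
                    for ant in lemma.antonyms():
                        pairs.append((lemma.name(), ant.name()))
            return pairs

        print(lemma_antonyms('good')[:3])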

    The Buttons

    The Search and Help buttons need no more explanation.

    The Show Database Info button shows a collection of Wordnet database statistics.

    The Shutdown the Server button is shown only for the first client of the BrowServer program, i.e. for the client that is launched automatically when the BrowServer is started; it is not shown for subsequent clients, in order to protect the server from accidental shutdowns.

    """ def get_static_welcome_message(): """ Get the static welcome page. """ return \ """

    Search Help

    • The display below the line is an example of the output the browser shows you when you enter a search word. The search word was green.
    • The search result shows, for each part of speech, the synsets, i.e. the different meanings of the word.
    • All underlined texts are hypertext links. There are two types of links: word links and others. Clicking a word link carries out a search for the word in the Wordnet database.
    • Clicking a link of the other type opens a display section of data attached to that link. Clicking that link a second time closes the section again.
    • Clicking S: opens a section showing the relations for that synset.
    • Clicking on a relation name opens a section that displays the associated synsets.
    • Type a search word in the Next Word field and start the search by pressing the Enter/Return key or by clicking the Search button.
    """ def get_static_index_page(with_shutdown): """ Get the static index page. """ template = \ """ NLTK Wordnet Browser """ if with_shutdown: upper_link = "upper.html" else: upper_link = "upper_2.html" return template % upper_link def get_static_upper_page(with_shutdown): """ Return the upper frame page, If with_shutdown is True then a 'shutdown' button is also provided to shutdown the server. """ template = \ """ Untitled Document
    Current Word:  Next Word: 
    Help %s """ if with_shutdown: shutdown_link = "Shutdown" else: shutdown_link = "" return template % shutdown_link def usage(): """ Display the command line help message. """ print(__doc__) def app(): # Parse and interpret options. (opts, _) = getopt.getopt(argv[1:], "l:p:sh", ["logfile=", "port=", "server-mode", "help"]) port = 8000 server_mode = False help_mode = False logfilename = None for (opt, value) in opts: if (opt == "-l") or (opt == "--logfile"): logfilename = str(value) elif (opt == "-p") or (opt == "--port"): port = int(value) elif (opt == "-s") or (opt == "--server-mode"): server_mode = True elif (opt == "-h") or (opt == "--help"): help_mode = True if help_mode: usage() else: wnb(port, not server_mode, logfilename) if __name__ == '__main__': app() __all__ = ['app'] nltk-3.1/nltk/book.py0000644000076500000240000000634612607524434014350 0ustar sbstaff00000000000000# Natural Language Toolkit: Some texts for exploration in chapter 1 of the book # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # # URL: # For license information, see LICENSE.TXT from __future__ import print_function from nltk.corpus import (gutenberg, genesis, inaugural, nps_chat, webtext, treebank, wordnet) from nltk.text import Text from nltk.probability import FreqDist print("*** Introductory Examples for the NLTK Book ***") print("Loading text1, ..., text9 and sent1, ..., sent9") print("Type the name of the text or sentence to view it.") print("Type: 'texts()' or 'sents()' to list the materials.") text1 = Text(gutenberg.words('melville-moby_dick.txt')) print("text1:", text1.name) text2 = Text(gutenberg.words('austen-sense.txt')) print("text2:", text2.name) text3 = Text(genesis.words('english-kjv.txt'), name="The Book of Genesis") print("text3:", text3.name) text4 = Text(inaugural.words(), name="Inaugural Address Corpus") print("text4:", text4.name) text5 = Text(nps_chat.words(), name="Chat Corpus") print("text5:", text5.name) text6 = Text(webtext.words('grail.txt'), name="Monty Python and the Holy Grail") print("text6:", text6.name) text7 = Text(treebank.words(), name="Wall Street Journal") print("text7:", text7.name) text8 = Text(webtext.words('singles.txt'), name="Personals Corpus") print("text8:", text8.name) text9 = Text(gutenberg.words('chesterton-thursday.txt')) print("text9:", text9.name) def texts(): print("text1:", text1.name) print("text2:", text2.name) print("text3:", text3.name) print("text4:", text4.name) print("text5:", text5.name) print("text6:", text6.name) print("text7:", text7.name) print("text8:", text8.name) print("text9:", text9.name) sent1 = ["Call", "me", "Ishmael", "."] sent2 = ["The", "family", "of", "Dashwood", "had", "long", "been", "settled", "in", "Sussex", "."] sent3 = ["In", "the", "beginning", "God", "created", "the", "heaven", "and", "the", "earth", "."] sent4 = ["Fellow", "-", "Citizens", "of", "the", "Senate", "and", "of", "the", "House", "of", "Representatives", ":"] sent5 = ["I", "have", "a", "problem", "with", "people", "PMing", "me", "to", "lol", "JOIN"] sent6 = ['SCENE', '1', ':', '[', 'wind', ']', '[', 'clop', 'clop', 'clop', ']', 'KING', 'ARTHUR', ':', 'Whoa', 'there', '!'] sent7 = ["Pierre", "Vinken", ",", "61", "years", "old", ",", "will", "join", "the", "board", "as", "a", "nonexecutive", "director", "Nov.", "29", "."] sent8 = ['25', 'SEXY', 'MALE', ',', 'seeks', 'attrac', 'older', 'single', 'lady', ',', 'for', 'discreet', 'encounters', '.'] sent9 = ["THE", "suburb", "of", "Saffron", "Park", "lay", "on", "the", "sunset", "side", "of", "London", ",", 
"as", "red", "and", "ragged", "as", "a", "cloud", "of", "sunset", "."] def sents(): print("sent1:", " ".join(sent1)) print("sent2:", " ".join(sent2)) print("sent3:", " ".join(sent3)) print("sent4:", " ".join(sent4)) print("sent5:", " ".join(sent5)) print("sent6:", " ".join(sent6)) print("sent7:", " ".join(sent7)) print("sent8:", " ".join(sent8)) print("sent9:", " ".join(sent9)) nltk-3.1/nltk/ccg/0000755000076500000240000000000012610001541013546 5ustar sbstaff00000000000000nltk-3.1/nltk/ccg/__init__.py0000644000076500000240000000175312607224144015701 0ustar sbstaff00000000000000# Natural Language Toolkit: Combinatory Categorial Grammar # # Copyright (C) 2001-2015 NLTK Project # Author: Graeme Gange # URL: # For license information, see LICENSE.TXT """ Combinatory Categorial Grammar. For more information see nltk/doc/contrib/ccg/ccg.pdf """ from nltk.ccg.combinator import (UndirectedBinaryCombinator, DirectedBinaryCombinator, ForwardCombinator, BackwardCombinator, UndirectedFunctionApplication, ForwardApplication, BackwardApplication, UndirectedComposition, ForwardComposition, BackwardComposition, BackwardBx, UndirectedSubstitution, ForwardSubstitution, BackwardSx, UndirectedTypeRaise, ForwardT, BackwardT) from nltk.ccg.chart import CCGEdge, CCGLeafEdge, CCGChartParser, CCGChart from nltk.ccg.lexicon import CCGLexicon nltk-3.1/nltk/ccg/api.py0000644000076500000240000002340012607224144014704 0ustar sbstaff00000000000000# Natural Language Toolkit: CCG Categories # # Copyright (C) 2001-2015 NLTK Project # Author: Graeme Gange # URL: # For license information, see LICENSE.TXT from __future__ import unicode_literals from nltk.internals import raise_unorderable_types from nltk.compat import (total_ordering, python_2_unicode_compatible, unicode_repr) @total_ordering class AbstractCCGCategory(object): ''' Interface for categories in combinatory grammars. ''' # Returns true if the category is primitive def is_primitive(self): raise NotImplementedError() # Returns true if the category is a function application def is_function(self): raise NotImplementedError() # Returns true if the category is a variable def is_var(self): raise NotImplementedError() # Takes a set of (var, category) substitutions, and replaces every # occurrence of the variable with the corresponding category def substitute(self,substitutions): raise NotImplementedError() # Determines whether two categories can be unified. # - Returns None if they cannot be unified # - Returns a list of necessary substitutions if they can.''' def can_unify(self,other): raise NotImplementedError() # Utility functions: comparison, strings and hashing. def __str__(self): raise NotImplementedError() def __eq__(self, other): return (self.__class__ is other.__class__ and self._comparison_key == other._comparison_key) def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, AbstractCCGCategory): raise_unorderable_types("<", self, other) if self.__class__ is other.__class__: return self._comparison_key < other._comparison_key else: return self.__class__.__name__ < other.__class__.__name__ def __hash__(self): try: return self._hash except AttributeError: self._hash = hash(self._comparison_key) return self._hash @python_2_unicode_compatible class CCGVar(AbstractCCGCategory): ''' Class representing a variable CCG category. Used for conjunctions (and possibly type-raising, if implemented as a unary rule). 
''' _maxID = 0 def __init__(self, prim_only=False): """Initialize a variable (selects a new identifier) :param prim_only: a boolean that determines whether the variable is restricted to primitives :type prim_only: bool """ self._id = self.new_id() self._prim_only = prim_only self._comparison_key = self._id @classmethod def new_id(cls): """A class method allowing generation of unique variable identifiers.""" cls._maxID = cls._maxID + 1 return cls._maxID - 1 def is_primitive(self): return False def is_function(self): return False def is_var(self): return True def substitute(self, substitutions): """If there is a substitution corresponding to this variable, return the substituted category. """ for (var,cat) in substitutions: if var == self: return cat return self def can_unify(self, other): """ If the variable can be replaced with other a substitution is returned. """ if other.is_primitive() or not self._prim_only: return [(self,other)] return None def id(self): return self._id def __str__(self): return "_var" + str(self._id) @total_ordering @python_2_unicode_compatible class Direction(object): ''' Class representing the direction of a function application. Also contains maintains information as to which combinators may be used with the category. ''' def __init__(self,dir,restrictions): self._dir = dir self._restrs = restrictions self._comparison_key = (dir, tuple(restrictions)) # Testing the application direction def is_forward(self): return self._dir == '/' def is_backward(self): return self._dir == '\\' def dir(self): return self._dir def restrs(self): """A list of restrictions on the combinators. '.' denotes that permuting operations are disallowed ',' denotes that function composition is disallowed '_' denotes that the direction has variable restrictions. (This is redundant in the current implementation of type-raising) """ return self._restrs def is_variable(self): return self._restrs == '_' # Unification and substitution of variable directions. # Used only if type-raising is implemented as a unary rule, as it # must inherit restrictions from the argument category. def can_unify(self,other): if other.is_variable(): return [('_',self.restrs())] elif self.is_variable(): return [('_',other.restrs())] else: if self.restrs() == other.restrs(): return [] return None def substitute(self,subs): if not self.is_variable(): return self for (var, restrs) in subs: if var == '_': return Direction(self._dir,restrs) return self # Testing permitted combinators def can_compose(self): return not ',' in self._restrs def can_cross(self): return not '.' 
in self._restrs def __eq__(self, other): return (self.__class__ is other.__class__ and self._comparison_key == other._comparison_key) def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, Direction): raise_unorderable_types("<", self, other) if self.__class__ is other.__class__: return self._comparison_key < other._comparison_key else: return self.__class__.__name__ < other.__class__.__name__ def __hash__(self): try: return self._hash except AttributeError: self._hash = hash(self._comparison_key) return self._hash def __str__(self): r_str = "" for r in self._restrs: r_str = r_str + "%s" % r return "%s%s" % (self._dir, r_str) # The negation operator reverses the direction of the application def __neg__(self): if self._dir == '/': return Direction('\\',self._restrs) else: return Direction('/',self._restrs) @python_2_unicode_compatible class PrimitiveCategory(AbstractCCGCategory): ''' Class representing primitive categories. Takes a string representation of the category, and a list of strings specifying the morphological subcategories. ''' def __init__(self, categ, restrictions=[]): self._categ = categ self._restrs = restrictions self._comparison_key = (categ, tuple(restrictions)) def is_primitive(self): return True def is_function(self): return False def is_var(self): return False def restrs(self): return self._restrs def categ(self): return self._categ # Substitution does nothing to a primitive category def substitute(self,subs): return self # A primitive can be unified with a class of the same # base category, given that the other category shares all # of its subclasses, or with a variable. def can_unify(self,other): if not other.is_primitive(): return None if other.is_var(): return [(other,self)] if other.categ() == self.categ(): for restr in self._restrs: if restr not in other.restrs(): return None return [] return None def __str__(self): if self._restrs == []: return "%s" % self._categ restrictions = "[%s]" % ",".join(unicode_repr(r) for r in self._restrs) return "%s%s" % (self._categ, restrictions) @python_2_unicode_compatible class FunctionalCategory(AbstractCCGCategory): ''' Class that represents a function application category. Consists of argument and result categories, together with an application direction. ''' def __init__(self, res, arg, dir): self._res = res self._arg = arg self._dir = dir self._comparison_key = (arg, dir, res) def is_primitive(self): return False def is_function(self): return True def is_var(self): return False # Substitution returns the category consisting of the # substitution applied to each of its constituents. def substitute(self,subs): sub_res = self._res.substitute(subs) sub_dir = self._dir.substitute(subs) sub_arg = self._arg.substitute(subs) return FunctionalCategory(sub_res,sub_arg,self._dir) # A function can unify with another function, so long as its # constituents can unify, or with an unrestricted variable. 
def can_unify(self,other): if other.is_var(): return [(other,self)] if other.is_function(): sa = self._res.can_unify(other.res()) sd = self._dir.can_unify(other.dir()) if sa is not None and sd is not None: sb = self._arg.substitute(sa).can_unify(other.arg().substitute(sa)) if sb is not None: return sa + sb return None # Constituent accessors def arg(self): return self._arg def res(self): return self._res def dir(self): return self._dir def __str__(self): return "(%s%s%s)" % (self._res, self._dir, self._arg) nltk-3.1/nltk/ccg/chart.py0000644000076500000240000002760112607224144015243 0ustar sbstaff00000000000000# Natural Language Toolkit: Combinatory Categorial Grammar # # Copyright (C) 2001-2015 NLTK Project # Author: Graeme Gange # URL: # For license information, see LICENSE.TXT """ The lexicon is constructed by calling ``lexicon.fromstring()``. In order to construct a parser, you also need a rule set. The standard English rules are provided in chart as ``chart.DefaultRuleSet``. The parser can then be constructed by calling, for example: ``parser = chart.CCGChartParser(, )`` Parsing is then performed by running ``parser.parse(.split())``. While this returns a list of trees, the default representation of the produced trees is not very enlightening, particularly given that it uses the same tree class as the CFG parsers. It is probably better to call: ``chart.printCCGDerivation()`` which should print a nice representation of the derivation. This entire process is shown far more clearly in the demonstration: python chart.py """ from __future__ import print_function, division, unicode_literals import itertools from nltk.parse import ParserI from nltk.parse.chart import AbstractChartRule, EdgeI, Chart from nltk.tree import Tree from nltk.ccg.lexicon import fromstring from nltk.ccg.combinator import (ForwardT, BackwardT, ForwardApplication, BackwardApplication, ForwardComposition, BackwardComposition, ForwardSubstitution, BackwardBx, BackwardSx) from nltk.compat import python_2_unicode_compatible, string_types # Based on the EdgeI class from NLTK. # A number of the properties of the EdgeI interface don't # transfer well to CCGs, however. class CCGEdge(EdgeI): def __init__(self, span, categ, rule): self._span = span self._categ = categ self._rule = rule self._comparison_key = (span, categ, rule) # Accessors def lhs(self): return self._categ def span(self): return self._span def start(self): return self._span[0] def end(self): return self._span[1] def length(self): return self._span[1] - self.span[0] def rhs(self): return () def dot(self): return 0 def is_complete(self): return True def is_incomplete(self): return False def nextsym(self): return None def categ(self): return self._categ def rule(self): return self._rule class CCGLeafEdge(EdgeI): ''' Class representing leaf edges in a CCG derivation. ''' def __init__(self, pos, categ, leaf): self._pos = pos self._categ = categ self._leaf = leaf self._comparison_key = (pos, categ, leaf) # Accessors def lhs(self): return self._categ def span(self): return (self._pos, self._pos+1) def start(self): return self._pos def end(self): return self._pos + 1 def length(self): return 1 def rhs(self): return self._leaf def dot(self): return 0 def is_complete(self): return True def is_incomplete(self): return False def nextsym(self): return None def categ(self): return self._categ def leaf(self): return self._leaf @python_2_unicode_compatible class BinaryCombinatorRule(AbstractChartRule): ''' Class implementing application of a binary combinator to a chart. 
Takes the directed combinator to apply. ''' NUMEDGES = 2 def __init__(self,combinator): self._combinator = combinator # Apply a combinator def apply(self, chart, grammar, left_edge, right_edge): # The left & right edges must be touching. if not (left_edge.end() == right_edge.start()): return # Check if the two edges are permitted to combine. # If so, generate the corresponding edge. if self._combinator.can_combine(left_edge.categ(),right_edge.categ()): for res in self._combinator.combine(left_edge.categ(), right_edge.categ()): new_edge = CCGEdge(span=(left_edge.start(), right_edge.end()),categ=res,rule=self._combinator) if chart.insert(new_edge,(left_edge,right_edge)): yield new_edge # The representation of the combinator (for printing derivations) def __str__(self): return "%s" % self._combinator # Type-raising must be handled slightly differently to the other rules, as the # resulting rules only span a single edge, rather than both edges. @python_2_unicode_compatible class ForwardTypeRaiseRule(AbstractChartRule): ''' Class for applying forward type raising ''' NUMEDGES = 2 def __init__(self): self._combinator = ForwardT def apply(self, chart, grammar, left_edge, right_edge): if not (left_edge.end() == right_edge.start()): return for res in self._combinator.combine(left_edge.categ(), right_edge.categ()): new_edge = CCGEdge(span=left_edge.span(),categ=res,rule=self._combinator) if chart.insert(new_edge,(left_edge,)): yield new_edge def __str__(self): return "%s" % self._combinator @python_2_unicode_compatible class BackwardTypeRaiseRule(AbstractChartRule): ''' Class for applying backward type raising. ''' NUMEDGES = 2 def __init__(self): self._combinator = BackwardT def apply(self, chart, grammar, left_edge, right_edge): if not (left_edge.end() == right_edge.start()): return for res in self._combinator.combine(left_edge.categ(), right_edge.categ()): new_edge = CCGEdge(span=right_edge.span(),categ=res,rule=self._combinator) if chart.insert(new_edge,(right_edge,)): yield new_edge def __str__(self): return "%s" % self._combinator # Common sets of combinators used for English derivations. ApplicationRuleSet = [BinaryCombinatorRule(ForwardApplication), BinaryCombinatorRule(BackwardApplication)] CompositionRuleSet = [BinaryCombinatorRule(ForwardComposition), BinaryCombinatorRule(BackwardComposition), BinaryCombinatorRule(BackwardBx)] SubstitutionRuleSet = [BinaryCombinatorRule(ForwardSubstitution), BinaryCombinatorRule(BackwardSx)] TypeRaiseRuleSet = [ForwardTypeRaiseRule(), BackwardTypeRaiseRule()] # The standard English rule set. DefaultRuleSet = ApplicationRuleSet + CompositionRuleSet + \ SubstitutionRuleSet + TypeRaiseRuleSet class CCGChartParser(ParserI): ''' Chart parser for CCGs. Based largely on the ChartParser class from NLTK. ''' def __init__(self, lexicon, rules, trace=0): self._lexicon = lexicon self._rules = rules self._trace = trace def lexicon(self): return self._lexicon # Implements the CYK algorithm def parse(self, tokens): tokens = list(tokens) chart = CCGChart(list(tokens)) lex = self._lexicon # Initialize leaf edges. 
for index in range(chart.num_leaves()): for cat in lex.categories(chart.leaf(index)): new_edge = CCGLeafEdge(index, cat, chart.leaf(index)) chart.insert(new_edge, ()) # Select a span for the new edges for span in range(2,chart.num_leaves()+1): for start in range(0,chart.num_leaves()-span+1): # Try all possible pairs of edges that could generate # an edge for that span for part in range(1,span): lstart = start mid = start + part rend = start + span for left in chart.select(span=(lstart,mid)): for right in chart.select(span=(mid,rend)): # Generate all possible combinations of the two edges for rule in self._rules: edges_added_by_rule = 0 for newedge in rule.apply(chart,lex,left,right): edges_added_by_rule += 1 # Output the resulting parses return chart.parses(lex.start()) class CCGChart(Chart): def __init__(self, tokens): Chart.__init__(self, tokens) # Constructs the trees for a given parse. Unfortnunately, the parse trees need to be # constructed slightly differently to those in the default Chart class, so it has to # be reimplemented def _trees(self, edge, complete, memo, tree_class): assert complete, "CCGChart cannot build incomplete trees" if edge in memo: return memo[edge] if isinstance(edge,CCGLeafEdge): word = tree_class(edge.lhs(), [self._tokens[edge.start()]]) leaf = tree_class((edge.lhs(), "Leaf"), [word]) memo[edge] = [leaf] return [leaf] memo[edge] = [] trees = [] lhs = (edge.lhs(), "%s" % edge.rule()) for cpl in self.child_pointer_lists(edge): child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl] for children in itertools.product(*child_choices): trees.append(tree_class(lhs, children)) memo[edge] = trees return trees #-------- # Displaying derivations #-------- def printCCGDerivation(tree): # Get the leaves and initial categories leafcats = tree.pos() leafstr = '' catstr = '' # Construct a string with both the leaf word and corresponding # category aligned. for (leaf, cat) in leafcats: str_cat = "%s" % cat # print(cat.__class__) # print("str_cat", str_cat) nextlen = 2 + max(len(leaf), len(str_cat)) lcatlen = (nextlen - len(str_cat)) // 2 rcatlen = lcatlen + (nextlen - len(str_cat)) % 2 catstr += ' '*lcatlen + str_cat + ' '*rcatlen lleaflen = (nextlen - len(leaf)) // 2 rleaflen = lleaflen + (nextlen - len(leaf)) % 2 leafstr += ' '*lleaflen + leaf + ' '*rleaflen print(leafstr) print(catstr) # Display the derivation steps printCCGTree(0,tree) # Prints the sequence of derivation steps. def printCCGTree(lwidth,tree): rwidth = lwidth # Is a leaf (word). # Increment the span by the space occupied by the leaf. if not isinstance(tree,Tree): return 2 + lwidth + len(tree) # Find the width of the current derivation step for child in tree: rwidth = max(rwidth, printCCGTree(rwidth,child)) # Is a leaf node. # Don't print anything, but account for the space occupied. if not isinstance(tree.label(), tuple): return max(rwidth,2 + lwidth + len("%s" % tree.label()), 2 + lwidth + len(tree[0])) (res,op) = tree.label() # Pad to the left with spaces, followed by a sequence of '-' # and the derivation rule. print(lwidth*' ' + (rwidth-lwidth)*'-' + "%s" % op) # Print the resulting category on a new line. 
str_res = "%s" % res respadlen = (rwidth - lwidth - len(str_res)) // 2 + lwidth print(respadlen*' ' + str_res) return rwidth ### Demonstration code # Construct the lexicon lex = fromstring(''' :- S, NP, N, VP # Primitive categories, S is the target primitive Det :: NP/N # Family of words Pro :: NP TV :: VP/NP Modal :: (S\\NP)/VP # Backslashes need to be escaped I => Pro # Word -> Category mapping you => Pro the => Det # Variables have the special keyword 'var' # '.' prevents permutation # ',' prevents composition and => var\\.,var/.,var which => (N\\N)/(S/NP) will => Modal # Categories can be either explicit, or families. might => Modal cook => TV eat => TV mushrooms => N parsnips => N bacon => N ''') def demo(): parser = CCGChartParser(lex, DefaultRuleSet) for parse in parser.parse("I might cook and eat the bacon".split()): printCCGDerivation(parse) if __name__ == '__main__': demo() nltk-3.1/nltk/ccg/combinator.py0000644000076500000240000002475612607224144016307 0ustar sbstaff00000000000000# Natural Language Toolkit: Combinatory Categorial Grammar # # Copyright (C) 2001-2015 NLTK Project # Author: Graeme Gange # URL: # For license information, see LICENSE.TXT """ CCG Combinators """ from __future__ import unicode_literals from nltk.compat import python_2_unicode_compatible from nltk.ccg.api import FunctionalCategory class UndirectedBinaryCombinator(object): """ Abstract class for representing a binary combinator. Merely defines functions for checking if the function and argument are able to be combined, and what the resulting category is. Note that as no assumptions are made as to direction, the unrestricted combinators can perform all backward, forward and crossed variations of the combinators; these restrictions must be added in the rule class. """ def can_combine(self, function, argument): raise NotImplementedError() def combine (self, function, argument): raise NotImplementedError() class DirectedBinaryCombinator(object): """ Wrapper for the undirected binary combinator. It takes left and right categories, and decides which is to be the function, and which the argument. It then decides whether or not they can be combined. """ def can_combine(self, left, right): raise NotImplementedError() def combine(self, left, right): raise NotImplementedError() @python_2_unicode_compatible class ForwardCombinator(DirectedBinaryCombinator): """ Class representing combinators where the primary functor is on the left. Takes an undirected combinator, and a predicate which adds constraints restricting the cases in which it may apply. """ def __init__(self, combinator, predicate, suffix=''): self._combinator = combinator self._predicate = predicate self._suffix = suffix def can_combine(self, left, right): return (self._combinator.can_combine(left, right) and self._predicate(left, right)) def combine(self, left, right): for cat in self._combinator.combine(left, right): yield cat def __str__(self): return ">%s%s" % (self._combinator, self._suffix) @python_2_unicode_compatible class BackwardCombinator(DirectedBinaryCombinator): """ The backward equivalent of the ForwardCombinator class. 
""" def __init__(self, combinator, predicate, suffix=''): self._combinator = combinator self._predicate = predicate self._suffix = suffix def can_combine(self, left, right): return (self._combinator.can_combine(right, left) and self._predicate(left, right)) def combine(self, left, right): for cat in self._combinator.combine(right, left): yield cat def __str__(self): return "<%s%s" % (self._combinator, self._suffix) @python_2_unicode_compatible class UndirectedFunctionApplication(UndirectedBinaryCombinator): """ Class representing function application. Implements rules of the form: X/Y Y -> X (>) And the corresponding backwards application rule """ def can_combine(self, function, argument): if not function.is_function(): return False return not function.arg().can_unify(argument) is None def combine(self, function, argument): if not function.is_function(): return subs = function.arg().can_unify(argument) if subs is None: return yield function.res().substitute(subs) def __str__(self): return '' # Predicates for function application. # Ensures the left functor takes an argument on the right def forwardOnly(left, right): return left.dir().is_forward() # Ensures the right functor takes an argument on the left def backwardOnly(left, right): return right.dir().is_backward() # Application combinator instances ForwardApplication = ForwardCombinator(UndirectedFunctionApplication(), forwardOnly) BackwardApplication = BackwardCombinator(UndirectedFunctionApplication(), backwardOnly) @python_2_unicode_compatible class UndirectedComposition(UndirectedBinaryCombinator): """ Functional composition (harmonic) combinator. Implements rules of the form X/Y Y/Z -> X/Z (B>) And the corresponding backwards and crossed variations. """ def can_combine(self, function, argument): # Can only combine two functions, and both functions must # allow composition. if not (function.is_function() and argument.is_function()): return False if function.dir().can_compose() and argument.dir().can_compose(): return not function.arg().can_unify(argument.res()) is None return False def combine(self, function, argument): if not (function.is_function() and argument.is_function()): return if function.dir().can_compose() and argument.dir().can_compose(): subs = function.arg().can_unify(argument.res()) if not subs is None: yield FunctionalCategory(function.res().substitute(subs), argument.arg().substitute(subs), argument.dir()) def __str__(self): return 'B' # Predicates for restricting application of straight composition. 
def bothForward(left, right): return left.dir().is_forward() and right.dir().is_forward() def bothBackward(left, right): return left.dir().is_backward() and right.dir().is_backward() # Predicates for crossed composition def crossedDirs(left, right): return left.dir().is_forward() and right.dir().is_backward() def backwardBxConstraint(left, right): # The functors must be crossed inwards if not crossedDirs(left, right): return False # Permuting combinators must be allowed if not left.dir().can_cross() and right.dir().can_cross(): return False # The resulting argument category is restricted to be primitive return left.arg().is_primitive() # Straight composition combinators ForwardComposition = ForwardCombinator(UndirectedComposition(), forwardOnly) BackwardComposition = BackwardCombinator(UndirectedComposition(), backwardOnly) # Backward crossed composition BackwardBx = BackwardCombinator(UndirectedComposition(), backwardBxConstraint, suffix='x') @python_2_unicode_compatible class UndirectedSubstitution(UndirectedBinaryCombinator): """ Substitution (permutation) combinator. Implements rules of the form Y/Z (X\Y)/Z -> X/Z ( N\N def innermostFunction(categ): while categ.res().is_function(): categ = categ.res() return categ @python_2_unicode_compatible class UndirectedTypeRaise(UndirectedBinaryCombinator): """ Undirected combinator for type raising. """ def can_combine(self, function, arg): # The argument must be a function. # The restriction that arg.res() must be a function # merely reduces redundant type-raising; if arg.res() is # primitive, we have: # X Y\X =>((>) Y # which is equivalent to # X Y\X =>(<) Y if not (arg.is_function() and arg.res().is_function()): return False arg = innermostFunction(arg) # left, arg_categ are undefined! subs = left.can_unify(arg_categ.arg()) if subs is not None: return True return False def combine(self, function, arg): if not (function.is_primitive() and arg.is_function() and arg.res().is_function()): return # Type-raising matches only the innermost application. arg = innermostFunction(arg) subs = function.can_unify(arg.arg()) if subs is not None: xcat = arg.res().substitute(subs) yield FunctionalCategory(xcat, FunctionalCategory(xcat, function, arg.dir()), -(arg.dir())) def __str__(self): return 'T' # Predicates for type-raising # The direction of the innermost category must be towards # the primary functor. # The restriction that the variable must be primitive is not # common to all versions of CCGs; some authors have other restrictions. 
def forwardTConstraint(left, right): arg = innermostFunction(right) return arg.dir().is_backward() and arg.res().is_primitive() def backwardTConstraint(left, right): arg = innermostFunction(left) return arg.dir().is_forward() and arg.res().is_primitive() # Instances of type-raising combinators ForwardT = ForwardCombinator(UndirectedTypeRaise(), forwardTConstraint) BackwardT = BackwardCombinator(UndirectedTypeRaise(), backwardTConstraint) nltk-3.1/nltk/ccg/lexicon.py0000644000076500000240000001761312607224144015605 0ustar sbstaff00000000000000# Natural Language Toolkit: Combinatory Categorial Grammar # # Copyright (C) 2001-2015 NLTK Project # Author: Graeme Gange # URL: # For license information, see LICENSE.TXT """ CCG Lexicons """ from __future__ import unicode_literals import re from collections import defaultdict from nltk.ccg.api import PrimitiveCategory, Direction, CCGVar, FunctionalCategory from nltk.compat import python_2_unicode_compatible from nltk.internals import deprecated #------------ # Regular expressions used for parsing components of the lexicon #------------ # Parses a primitive category and subscripts PRIM_RE = re.compile(r'''([A-Za-z]+)(\[[A-Za-z,]+\])?''') # Separates the next primitive category from the remainder of the # string NEXTPRIM_RE = re.compile(r'''([A-Za-z]+(?:\[[A-Za-z,]+\])?)(.*)''') # Separates the next application operator from the remainder APP_RE = re.compile(r'''([\\/])([.,]?)([.,]?)(.*)''') # Parses the definition of the category of either a word or a family LEX_RE = re.compile(r'''([\w_]+)\s*(::|[-=]+>)\s*(.+)''', re.UNICODE) # Strips comments from a line COMMENTS_RE = re.compile('''([^#]*)(?:#.*)?''') #---------- # Lexicons #---------- @python_2_unicode_compatible class CCGLexicon(object): """ Class representing a lexicon for CCG grammars. * `primitives`: The list of primitive categories for the lexicon * `families`: Families of categories * `entries`: A mapping of words to possible categories """ def __init__(self, start, primitives, families, entries): self._start = PrimitiveCategory(start) self._primitives = primitives self._families = families self._entries = entries def categories(self, word): """ Returns all the possible categories for a word """ return self._entries[word] def start(self): """ Return the target category for the parser """ return self._start def __str__(self): """ String representation of the lexicon. Used for debugging. """ string = "" first = True for ident in self._entries: if not first: string = string + "\n" string = string + ident + " => " first = True for cat in self._entries[ident]: if not first: string = string + " | " else: first = False string = string + "%s" % cat return string #----------- # Parsing lexicons #----------- def matchBrackets(string): """ Separate the contents matching the first set of brackets from the rest of the input. 
""" rest = string[1:] inside = "(" while rest != "" and not rest.startswith(')'): if rest.startswith('('): (part, rest) = matchBrackets(rest) inside = inside + part else: inside = inside + rest[0] rest = rest[1:] if rest.startswith(')'): return (inside + ')', rest[1:]) raise AssertionError('Unmatched bracket in string \'' + string + '\'') def nextCategory(string): """ Separate the string for the next portion of the category from the rest of the string """ if string.startswith('('): return matchBrackets(string) return NEXTPRIM_RE.match(string).groups() def parseApplication(app): """ Parse an application operator """ return Direction(app[0], app[1:]) def parseSubscripts(subscr): """ Parse the subscripts for a primitive category """ if subscr: return subscr[1:-1].split(',') return [] def parsePrimitiveCategory(chunks, primitives, families, var): """ Parse a primitive category If the primitive is the special category 'var', replace it with the correct `CCGVar`. """ if chunks[0] == "var": if chunks[1] is None: if var is None: var = CCGVar() return (var, var) catstr = chunks[0] if catstr in families: (cat, cvar) = families[catstr] if var is None: var = cvar else: cat = cat.substitute([(cvar, var)]) return (cat, var) if catstr in primitives: subscrs = parseSubscripts(chunks[1]) return (PrimitiveCategory(catstr, subscrs), var) raise AssertionError('String \'' + catstr + '\' is neither a family nor primitive category.') def parseCategory(line, primitives, families): """ Drop the 'var' from the tuple """ return augParseCategory(line, primitives, families)[0] def augParseCategory(line, primitives, families, var=None): """ Parse a string representing a category, and returns a tuple with (possibly) the CCG variable for the category """ (cat_string, rest) = nextCategory(line) if cat_string.startswith('('): (res, var) = augParseCategory(cat_string[1:-1], primitives, families, var) else: # print rePrim.match(str).groups() (res, var) =\ parsePrimitiveCategory(PRIM_RE.match(cat_string).groups(), primitives, families, var) while rest != "": app = APP_RE.match(rest).groups() direction = parseApplication(app[0:3]) rest = app[3] (cat_string, rest) = nextCategory(rest) if cat_string.startswith('('): (arg, var) = augParseCategory(cat_string[1:-1], primitives, families, var) else: (arg, var) =\ parsePrimitiveCategory(PRIM_RE.match(cat_string).groups(), primitives, families, var) res = FunctionalCategory(res, arg, direction) return (res, var) def fromstring(lex_str): """ Convert string representation into a lexicon for CCGs. """ primitives = [] families = {} entries = defaultdict(list) for line in lex_str.splitlines(): # Strip comments and leading/trailing whitespace. line = COMMENTS_RE.match(line).groups()[0].strip() if line == "": continue if line.startswith(':-'): # A line of primitive categories. 
# The first one is the target category # ie, :- S, N, NP, VP primitives = primitives + [prim.strip() for prim in line[2:].strip().split(',')] else: # Either a family definition, or a word definition (ident, sep, catstr) = LEX_RE.match(line).groups() (cat, var) = augParseCategory(catstr, primitives, families) if sep == '::': # Family definition # ie, Det :: NP/N families[ident] = (cat, var) else: # Word definition # ie, which => (N\N)/(S/NP) entries[ident].append(cat) return CCGLexicon(primitives[0], primitives, families, entries) @deprecated('Use fromstring() instead.') def parseLexicon(lex_str): return fromstring(lex_str) openccg_tinytiny = fromstring(""" # Rather minimal lexicon based on the openccg `tinytiny' grammar. # Only incorporates a subset of the morphological subcategories, however. :- S,NP,N # Primitive categories Det :: NP/N # Determiners Pro :: NP IntransVsg :: S\\NP[sg] # Tensed intransitive verbs (singular) IntransVpl :: S\\NP[pl] # Plural TransVsg :: S\\NP[sg]/NP # Tensed transitive verbs (singular) TransVpl :: S\\NP[pl]/NP # Plural the => NP[sg]/N[sg] the => NP[pl]/N[pl] I => Pro me => Pro we => Pro us => Pro book => N[sg] books => N[pl] peach => N[sg] peaches => N[pl] policeman => N[sg] policemen => N[pl] boy => N[sg] boys => N[pl] sleep => IntransVsg sleep => IntransVpl eat => IntransVpl eat => TransVpl eats => IntransVsg eats => TransVsg see => TransVpl sees => TransVsg """) nltk-3.1/nltk/chat/0000755000076500000240000000000012610001541013731 5ustar sbstaff00000000000000nltk-3.1/nltk/chat/__init__.py0000644000076500000240000000307412607224144016062 0ustar sbstaff00000000000000# Natural Language Toolkit: Chatbots # # Copyright (C) 2001-2015 NLTK Project # Authors: Steven Bird # URL: # For license information, see LICENSE.TXT # Based on an Eliza implementation by Joe Strout , # Jeff Epler and Jez Higgins . """ A class for simple chatbots. These perform simple pattern matching on sentences typed by users, and respond with automatically generated sentences. These chatbots may not work using the windows command line or the windows IDLE GUI. """ from __future__ import print_function from nltk.chat.util import Chat from nltk.chat.eliza import eliza_chat from nltk.chat.iesha import iesha_chat from nltk.chat.rude import rude_chat from nltk.chat.suntsu import suntsu_chat from nltk.chat.zen import zen_chat bots = [ (eliza_chat, 'Eliza (psycho-babble)'), (iesha_chat, 'Iesha (teen anime junky)'), (rude_chat, 'Rude (abusive bot)'), (suntsu_chat, 'Suntsu (Chinese sayings)'), (zen_chat, 'Zen (gems of wisdom)')] def chatbots(): import sys print('Which chatbot would you like to talk to?') botcount = len(bots) for i in range(botcount): print(' %d: %s' % (i+1, bots[i][1])) while True: print('\nEnter a number in the range 1-%d: ' % botcount, end=' ') choice = sys.stdin.readline().strip() if choice.isdigit() and (int(choice) - 1) in range(botcount): break else: print(' Error: bad chatbot number') chatbot = bots[int(choice)-1][0] chatbot() nltk-3.1/nltk/chat/eliza.py0000644000076500000240000001576312607224144015437 0ustar sbstaff00000000000000# Natural Language Toolkit: Eliza # # Copyright (C) 2001-2015 NLTK Project # Authors: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT # Based on an Eliza implementation by Joe Strout , # Jeff Epler and Jez Higgins . # a translation table used to convert things you say into things the # computer says back, e.g. 
"I am" --> "you are" from __future__ import print_function from nltk.chat.util import Chat, reflections # a table of response pairs, where each pair consists of a # regular expression, and a list of possible responses, # with group-macros labelled as %1, %2. pairs = ( (r'I need (.*)', ( "Why do you need %1?", "Would it really help you to get %1?", "Are you sure you need %1?")), (r'Why don\'t you (.*)', ( "Do you really think I don't %1?", "Perhaps eventually I will %1.", "Do you really want me to %1?")), (r'Why can\'t I (.*)', ( "Do you think you should be able to %1?", "If you could %1, what would you do?", "I don't know -- why can't you %1?", "Have you really tried?")), (r'I can\'t (.*)', ( "How do you know you can't %1?", "Perhaps you could %1 if you tried.", "What would it take for you to %1?")), (r'I am (.*)', ( "Did you come to me because you are %1?", "How long have you been %1?", "How do you feel about being %1?")), (r'I\'m (.*)', ( "How does being %1 make you feel?", "Do you enjoy being %1?", "Why do you tell me you're %1?", "Why do you think you're %1?")), (r'Are you (.*)', ( "Why does it matter whether I am %1?", "Would you prefer it if I were not %1?", "Perhaps you believe I am %1.", "I may be %1 -- what do you think?")), (r'What (.*)', ( "Why do you ask?", "How would an answer to that help you?", "What do you think?")), (r'How (.*)', ( "How do you suppose?", "Perhaps you can answer your own question.", "What is it you're really asking?")), (r'Because (.*)', ( "Is that the real reason?", "What other reasons come to mind?", "Does that reason apply to anything else?", "If %1, what else must be true?")), (r'(.*) sorry (.*)', ( "There are many times when no apology is needed.", "What feelings do you have when you apologize?")), (r'Hello(.*)', ( "Hello... I'm glad you could drop by today.", "Hi there... 
how are you today?", "Hello, how are you feeling today?")), (r'I think (.*)', ( "Do you doubt %1?", "Do you really think so?", "But you're not sure %1?")), (r'(.*) friend (.*)', ( "Tell me more about your friends.", "When you think of a friend, what comes to mind?", "Why don't you tell me about a childhood friend?")), (r'Yes', ( "You seem quite sure.", "OK, but can you elaborate a bit?")), (r'(.*) computer(.*)', ( "Are you really talking about me?", "Does it seem strange to talk to a computer?", "How do computers make you feel?", "Do you feel threatened by computers?")), (r'Is it (.*)', ( "Do you think it is %1?", "Perhaps it's %1 -- what do you think?", "If it were %1, what would you do?", "It could well be that %1.")), (r'It is (.*)', ( "You seem very certain.", "If I told you that it probably isn't %1, what would you feel?")), (r'Can you (.*)', ( "What makes you think I can't %1?", "If I could %1, then what?", "Why do you ask if I can %1?")), (r'Can I (.*)', ( "Perhaps you don't want to %1.", "Do you want to be able to %1?", "If you could %1, would you?")), (r'You are (.*)', ( "Why do you think I am %1?", "Does it please you to think that I'm %1?", "Perhaps you would like me to be %1.", "Perhaps you're really talking about yourself?")), (r'You\'re (.*)', ( "Why do you say I am %1?", "Why do you think I am %1?", "Are we talking about you, or me?")), (r'I don\'t (.*)', ( "Don't you really %1?", "Why don't you %1?", "Do you want to %1?")), (r'I feel (.*)', ( "Good, tell me more about these feelings.", "Do you often feel %1?", "When do you usually feel %1?", "When you feel %1, what do you do?")), (r'I have (.*)', ( "Why do you tell me that you've %1?", "Have you really %1?", "Now that you have %1, what will you do next?")), (r'I would (.*)', ( "Could you explain why you would %1?", "Why would you %1?", "Who else knows that you would %1?")), (r'Is there (.*)', ( "Do you think there is %1?", "It's likely that there is %1.", "Would you like there to be %1?")), (r'My (.*)', ( "I see, your %1.", "Why do you say that your %1?", "When your %1, how do you feel?")), (r'You (.*)', ( "We should be discussing you, not me.", "Why do you say that about me?", "Why do you care whether I %1?")), (r'Why (.*)', ( "Why don't you tell me the reason why %1?", "Why do you think %1?" )), (r'I want (.*)', ( "What would it mean to you if you got %1?", "Why do you want %1?", "What would you do if you got %1?", "If you got %1, then what would you do?")), (r'(.*) mother(.*)', ( "Tell me more about your mother.", "What was your relationship with your mother like?", "How do you feel about your mother?", "How does this relate to your feelings today?", "Good family relations are important.")), (r'(.*) father(.*)', ( "Tell me more about your father.", "How did your father make you feel?", "How do you feel about your father?", "Does your relationship with your father relate to your feelings today?", "Do you have trouble showing affection with your family?")), (r'(.*) child(.*)', ( "Did you have close friends as a child?", "What is your favorite childhood memory?", "Do you remember any dreams or nightmares from childhood?", "Did the other children sometimes tease you?", "How do you think your childhood experiences relate to your feelings today?")), (r'(.*)\?', ( "Why do you ask that?", "Please consider whether you can answer your own question.", "Perhaps the answer lies within yourself?", "Why don't you tell me?")), (r'quit', ( "Thank you for talking with me.", "Good-bye.", "Thank you, that will be $150. 
Have a good day!")), (r'(.*)', ( "Please tell me more.", "Let's change focus a bit... Tell me about your family.", "Can you elaborate on that?", "Why do you say that %1?", "I see.", "Very interesting.", "%1.", "I see. And what does that tell you?", "How does that make you feel?", "How do you feel when you say that?")) ) eliza_chatbot = Chat(pairs, reflections) def eliza_chat(): print("Therapist\n---------") print("Talk to the program by typing in plain English, using normal upper-") print('and lower-case letters and punctuation. Enter "quit" when done.') print('='*72) print("Hello. How are you feeling today?") eliza_chatbot.converse() def demo(): eliza_chat() if __name__ == "__main__": demo() nltk-3.1/nltk/chat/iesha.py0000644000076500000240000000740612607224144015417 0ustar sbstaff00000000000000# Natural Language Toolkit: Teen Chatbot # # Copyright (C) 2001-2015 NLTK Project # Author: Selina Dennis # URL: # For license information, see LICENSE.TXT """ This chatbot is a tongue-in-cheek take on the average teen anime junky that frequents YahooMessenger or MSNM. All spelling mistakes and flawed grammar are intentional. """ from __future__ import print_function from nltk.chat.util import Chat reflections = { "am" : "r", "was" : "were", "i" : "u", "i'd" : "u'd", "i've" : "u'v", "ive" : "u'v", "i'll" : "u'll", "my" : "ur", "are" : "am", "you're" : "im", "you've" : "ive", "you'll" : "i'll", "your" : "my", "yours" : "mine", "you" : "me", "u" : "me", "ur" : "my", "urs" : "mine", "me" : "u" } # Note: %1/2/etc are used without spaces prior as the chat bot seems # to add a superfluous space when matching. pairs = ( (r'I\'m (.*)', ( "ur%1?? that's so cool! kekekekeke ^_^ tell me more!", "ur%1? neat!! kekeke >_<")), (r'(.*) don\'t you (.*)', ( "u think I can%2??! really?? kekeke \<_\<", "what do u mean%2??!", "i could if i wanted, don't you think!! kekeke")), (r'ye[as] [iI] (.*)', ( "u%1? cool!! how?", "how come u%1??", "u%1? so do i!!")), (r'do (you|u) (.*)\??', ( "do i%2? only on tuesdays! kekeke *_*", "i dunno! do u%2??")), (r'(.*)\?', ( "man u ask lots of questions!", "booooring! how old r u??", "boooooring!! ur not very fun")), (r'(cos|because) (.*)', ( "hee! i don't believe u! >_<", "nuh-uh! >_<", "ooooh i agree!")), (r'why can\'t [iI] (.*)', ( "i dunno! y u askin me for!", "try harder, silly! hee! ^_^", "i dunno! but when i can't%1 i jump up and down!")), (r'I can\'t (.*)', ( "u can't what??! >_<", "that's ok! i can't%1 either! kekekekeke ^_^", "try harder, silly! hee! ^&^")), (r'(.*) (like|love|watch) anime', ( "omg i love anime!! do u like sailor moon??! ^&^", "anime yay! anime rocks sooooo much!", "oooh anime! i love anime more than anything!", "anime is the bestest evar! evangelion is the best!", "hee anime is the best! do you have ur fav??")), (r'I (like|love|watch|play) (.*)', ( "yay! %2 rocks!", "yay! %2 is neat!", "cool! do u like other stuff?? ^_^")), (r'anime sucks|(.*) (hate|detest) anime', ( "ur a liar! i'm not gonna talk to u nemore if u h8 anime *;*", "no way! anime is the best ever!", "nuh-uh, anime is the best!")), (r'(are|r) (you|u) (.*)', ( "am i%1??! how come u ask that!", "maybe! y shud i tell u?? kekeke >_>")), (r'what (.*)', ( "hee u think im gonna tell u? .v.", "booooooooring! ask me somethin else!")), (r'how (.*)', ( "not tellin!! kekekekekeke ^_^",)), (r'(hi|hello|hey) (.*)', ( "hi!!! how r u!!",)), (r'quit', ( "mom says i have to go eat dinner now :,( bye!!", "awww u have to go?? see u next time!!", "how to see u again soon! ^_^")), (r'(.*)', ( "ur funny! 
kekeke", "boooooring! talk about something else! tell me wat u like!", "do u like anime??", "do u watch anime? i like sailor moon! ^_^", "i wish i was a kitty!! kekekeke ^_^")) ) iesha_chatbot = Chat(pairs, reflections) def iesha_chat(): print("Iesha the TeenBoT\n---------") print("Talk to the program by typing in plain English, using normal upper-") print('and lower-case letters and punctuation. Enter "quit" when done.') print('='*72) print("hi!! i'm iesha! who r u??!") iesha_chatbot.converse() def demo(): iesha_chat() if __name__ == "__main__": demo() nltk-3.1/nltk/chat/rude.py0000644000076500000240000000522412607224144015261 0ustar sbstaff00000000000000# Natural Language Toolkit: Rude Chatbot # # Copyright (C) 2001-2015 NLTK Project # Author: Peter Spiller # URL: # For license information, see LICENSE.TXT from __future__ import print_function from nltk.chat.util import Chat, reflections pairs = ( (r'We (.*)', ("What do you mean, 'we'?", "Don't include me in that!", "I wouldn't be so sure about that.")), (r'You should (.*)', ("Don't tell me what to do, buddy.", "Really? I should, should I?")), (r'You\'re(.*)', ("More like YOU'RE %1!", "Hah! Look who's talking.", "Come over here and tell me I'm %1.")), (r'You are(.*)', ("More like YOU'RE %1!", "Hah! Look who's talking.", "Come over here and tell me I'm %1.")), (r'I can\'t(.*)', ("You do sound like the type who can't %1.", "Hear that splashing sound? That's my heart bleeding for you.", "Tell somebody who might actually care.")), (r'I think (.*)', ("I wouldn't think too hard if I were you.", "You actually think? I'd never have guessed...")), (r'I (.*)', ("I'm getting a bit tired of hearing about you.", "How about we talk about me instead?", "Me, me, me... Frankly, I don't care.")), (r'How (.*)', ("How do you think?", "Take a wild guess.", "I'm not even going to dignify that with an answer.")), (r'What (.*)', ("Do I look like an encyclopedia?", "Figure it out yourself.")), (r'Why (.*)', ("Why not?", "That's so obvious I thought even you'd have already figured it out.")), (r'(.*)shut up(.*)', ("Make me.", "Getting angry at a feeble NLP assignment? Somebody's losing it.", "Say that again, I dare you.")), (r'Shut up(.*)', ("Make me.", "Getting angry at a feeble NLP assignment? Somebody's losing it.", "Say that again, I dare you.")), (r'Hello(.*)', ("Oh good, somebody else to talk to. Joy.", "'Hello'? How original...")), (r'(.*)', ("I'm getting bored here. Become more interesting.", "Either become more thrilling or get lost, buddy.", "Change the subject before I die of fatal boredom.")) ) rude_chatbot = Chat(pairs, reflections) def rude_chat(): print("Talk to the program by typing in plain English, using normal upper-") print('and lower-case letters and punctuation. Enter "quit" when done.') print('='*72) print("I suppose I should say hello.") rude_chatbot.converse() def demo(): rude_chat() if __name__ == "__main__": demo() nltk-3.1/nltk/chat/suntsu.py0000644000076500000240000001421312607224144015661 0ustar sbstaff00000000000000# Natural Language Toolkit: Sun Tsu-Bot # # Copyright (C) 2001-2015 NLTK Project # Author: Sam Huston 2007 # URL: # For license information, see LICENSE.TXT """ Tsu bot responds to all queries with a Sun Tsu sayings Quoted from Sun Tsu's The Art of War Translated by LIONEL GILES, M.A. 
1910 Hosted by the Gutenberg Project http://www.gutenberg.org/ """ from __future__ import print_function from nltk.chat.util import Chat, reflections pairs = ( (r'quit', ( "Good-bye.", "Plan well", "May victory be your future")), (r'[^\?]*\?', ("Please consider whether you can answer your own question.", "Ask me no questions!")), (r'[0-9]+(.*)', ("It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.", "There are five essentials for victory")), (r'[A-Ca-c](.*)', ("The art of war is of vital importance to the State.", "All warfare is based on deception.", "If your opponent is secure at all points, be prepared for him. If he is in superior strength, evade him.", "If the campaign is protracted, the resources of the State will not be equal to the strain.", "Attack him where he is unprepared, appear where you are not expected.", "There is no instance of a country having benefited from prolonged warfare.")), (r'[D-Fd-f](.*)', ("The skillful soldier does not raise a second levy, neither are his supply-wagons loaded more than twice.", "Bring war material with you from home, but forage on the enemy.", "In war, then, let your great object be victory, not lengthy campaigns.", "To fight and conquer in all your battles is not supreme excellence; supreme excellence consists in breaking the enemy's resistance without fighting.")), (r'[G-Ig-i](.*)', ("Heaven signifies night and day, cold and heat, times and seasons.", "It is the rule in war, if our forces are ten to the enemy's one, to surround him; if five to one, to attack him; if twice as numerous, to divide our army into two.", "The good fighters of old first put themselves beyond the possibility of defeat, and then waited for an opportunity of defeating the enemy.", "One may know how to conquer without being able to do it.")), (r'[J-Lj-l](.*)', ("There are three ways in which a ruler can bring misfortune upon his army.", "By commanding the army to advance or to retreat, being ignorant of the fact that it cannot obey. This is called hobbling the army.", "By attempting to govern an army in the same way as he administers a kingdom, being ignorant of the conditions which obtain in an army. This causes restlessness in the soldier's minds.", "By employing the officers of his army without discrimination, through ignorance of the military principle of adaptation to circumstances. 
This shakes the confidence of the soldiers.", "There are five essentials for victory", "He will win who knows when to fight and when not to fight.", "He will win who knows how to handle both superior and inferior forces.", "He will win whose army is animated by the same spirit throughout all its ranks.", "He will win who, prepared himself, waits to take the enemy unprepared.", "He will win who has military capacity and is not interfered with by the sovereign.")), (r'[M-Om-o](.*)', ("If you know the enemy and know yourself, you need not fear the result of a hundred battles.", "If you know yourself but not the enemy, for every victory gained you will also suffer a defeat.", "If you know neither the enemy nor yourself, you will succumb in every battle.", "The control of a large force is the same principle as the control of a few men: it is merely a question of dividing up their numbers.")), (r'[P-Rp-r](.*)', ("Security against defeat implies defensive tactics; ability to defeat the enemy means taking the offensive.", "Standing on the defensive indicates insufficient strength; attacking, a superabundance of strength.", "He wins his battles by making no mistakes. Making no mistakes is what establishes the certainty of victory, for it means conquering an enemy that is already defeated.", "A victorious army opposed to a routed one, is as a pound's weight placed in the scale against a single grain.", "The onrush of a conquering force is like the bursting of pent-up waters into a chasm a thousand fathoms deep.")), (r'[S-Us-u](.*)', ("What the ancients called a clever fighter is one who not only wins, but excels in winning with ease.", "Hence his victories bring him neither reputation for wisdom nor credit for courage.", "Hence the skillful fighter puts himself into a position which makes defeat impossible, and does not miss the moment for defeating the enemy.", "In war the victorious strategist only seeks battle after the victory has been won, whereas he who is destined to defeat first fights and afterwards looks for victory.", "There are not more than five musical notes, yet the combinations of these five give rise to more melodies than can ever be heard.", "Appear at points which the enemy must hasten to defend; march swiftly to places where you are not expected.")), (r'[V-Zv-z](.*)', ("It is a matter of life and death, a road either to safety or to ruin.", "Hold out baits to entice the enemy. Feign disorder, and crush him.", "All men can see the tactics whereby I conquer, but what none can see is the strategy out of which victory is evolved.", "Do not repeat the tactics which have gained you one victory, but let your methods be regulated by the infinite variety of circumstances.", "So in war, the way is to avoid what is strong and to strike at what is weak.", "Just as water retains no constant shape, so in warfare there are no constant conditions.")), (r'(.*)', ( "Your statement insults me.", "")) ) suntsu_chatbot = Chat(pairs, reflections) def suntsu_chat(): print("Talk to the program by typing in plain English, using normal upper-") print('and lower-case letters and punctuation. 
Enter "quit" when done.') print('='*72) print("You seek enlightenment?") suntsu_chatbot.converse() def demo(): suntsu_chat() if __name__ == "__main__": demo() nltk-3.1/nltk/chat/util.py0000644000076500000240000000750112607224144015277 0ustar sbstaff00000000000000# Natural Language Toolkit: Chatbot Utilities # # Copyright (C) 2001-2015 NLTK Project # Authors: Steven Bird # URL: # For license information, see LICENSE.TXT # Based on an Eliza implementation by Joe Strout , # Jeff Epler and Jez Higgins . from __future__ import print_function import re import random from nltk import compat reflections = { "i am" : "you are", "i was" : "you were", "i" : "you", "i'm" : "you are", "i'd" : "you would", "i've" : "you have", "i'll" : "you will", "my" : "your", "you are" : "I am", "you were" : "I was", "you've" : "I have", "you'll" : "I will", "your" : "my", "yours" : "mine", "you" : "me", "me" : "you" } class Chat(object): def __init__(self, pairs, reflections={}): """ Initialize the chatbot. Pairs is a list of patterns and responses. Each pattern is a regular expression matching the user's statement or question, e.g. r'I like (.*)'. For each such pattern a list of possible responses is given, e.g. ['Why do you like %1', 'Did you ever dislike %1']. Material which is matched by parenthesized sections of the patterns (e.g. .*) is mapped to the numbered positions in the responses, e.g. %1. :type pairs: list of tuple :param pairs: The patterns and responses :type reflections: dict :param reflections: A mapping between first and second person expressions :rtype: None """ self._pairs = [(re.compile(x, re.IGNORECASE),y) for (x,y) in pairs] self._reflections = reflections self._regex = self._compile_reflections() def _compile_reflections(self): sorted_refl = sorted(self._reflections.keys(), key=len, reverse=True) return re.compile(r"\b({0})\b".format("|".join(map(re.escape, sorted_refl))), re.IGNORECASE) def _substitute(self, str): """ Substitute words in the string, according to the specified reflections, e.g. "I'm" -> "you are" :type str: str :param str: The string to be mapped :rtype: str """ return self._regex.sub(lambda mo: self._reflections[mo.string[mo.start():mo.end()]], str.lower()) def _wildcards(self, response, match): pos = response.find('%') while pos >= 0: num = int(response[pos+1:pos+2]) response = response[:pos] + \ self._substitute(match.group(num)) + \ response[pos+2:] pos = response.find('%') return response def respond(self, str): """ Generate a response to the user input. :type str: str :param str: The string to be mapped :rtype: str """ # check each pattern for (pattern, response) in self._pairs: match = pattern.match(str) # did the pattern match? if match: resp = random.choice(response) # pick a random response resp = self._wildcards(resp, match) # process wildcards # fix munged punctuation at the end if resp[-2:] == '?.': resp = resp[:-2] + '.' if resp[-2:] == '??': resp = resp[:-2] + '?' return resp # Hold a conversation with a chatbot def converse(self, quit="quit"): input = "" while input != quit: input = quit try: input = compat.raw_input(">") except EOFError: print(input) if input: while input[-1] in "!.": input = input[:-1] print(self.respond(input)) nltk-3.1/nltk/chat/zen.py0000644000076500000240000002341712607224144015122 0ustar sbstaff00000000000000# Natural Language Toolkit: Zen Chatbot # # Copyright (C) 2001-2015 NLTK Project # Author: Amy Holland # URL: # For license information, see LICENSE.TXT """ Zen Chatbot talks in gems of Zen wisdom. 
This is a sample conversation with Zen Chatbot: ZC: Welcome, my child. me: Good afternoon. ZC: Ask the question you have come to ask. me: How can I achieve enlightenment? ZC: How do you suppose? me: Through meditation. ZC: Form is emptiness, and emptiness form. me: How can I empty my mind of worldly troubles? ZC: Will an answer to that really help in your search for enlightenment? me: Yes. ZC: It is better to be right than to be certain. me: I seek truth and wisdom. ZC: The search for truth is a long journey. me: Are you sure? ZC: Maybe sure, maybe not sure. The chatbot structure is based on that of chat.eliza. Thus, it uses a translation table to convert from question to response i.e. "I am" --> "you are" Of course, since Zen Chatbot does not understand the meaning of any words, responses are very limited. Zen Chatbot will usually answer very vaguely, or respond to a question by asking a different question, in much the same way as Eliza. """ from __future__ import print_function from nltk.chat.util import Chat, reflections # responses are matched top to bottom, so non-specific matches occur later # for each match, a list of possible responses is provided responses = ( # Zen Chatbot opens with the line "Welcome, my child." The usual # response will be a greeting problem: 'good' matches "good morning", # "good day" etc, but also "good grief!" and other sentences starting # with the word 'good' that may not be a greeting (r'(hello(.*))|(good [a-zA-Z]+)', ( "The path to enlightenment is often difficult to see.", "Greetings. I sense your mind is troubled. Tell me of your troubles.", "Ask the question you have come to ask.", "Hello. Do you seek englightenment?")), # "I need" and "I want" can be followed by a thing (eg 'help') # or an action (eg 'to see you') # # This is a problem with this style of response - # person: "I need you" # chatbot: "me can be achieved by hard work and dedication of the mind" # i.e. 'you' is not really a thing that can be mapped this way, so this # interpretation only makes sense for some inputs # (r'i need (.*)', ( "%1 can be achieved by hard work and dedication of the mind.", "%1 is not a need, but a desire of the mind. Clear your mind of such concerns.", "Focus your mind on%1, and you will find what you need.")), (r'i want (.*)', ( "Desires of the heart will distract you from the path to enlightenment.", "Will%1 help you attain enlightenment?", "Is%1 a desire of the mind, or of the heart?")), # why questions are separated into three types: # "why..I" e.g. "why am I here?" "Why do I like cake?" # "why..you" e.g. "why are you here?" "Why won't you tell me?" # "why..." e.g. "Why is the sky blue?" # problems: # person: "Why can't you tell me?" # chatbot: "Are you sure I tell you?" # - this style works for positives (e.g. "why do you like cake?") # but does not work for negatives (e.g. "why don't you like cake?") (r'why (.*) i (.*)\?', ( "You%1%2?", "Perhaps you only think you%1%2")), (r'why (.*) you(.*)\?', ( "Why%1 you%2?", "%2 I%1", "Are you sure I%2?")), (r'why (.*)\?', ( "I cannot tell you why%1.", "Why do you think %1?" )), # e.g. "are you listening?", "are you a duck" (r'are you (.*)\?', ( "Maybe%1, maybe not%1.", "Whether I am%1 or not is God's business.")), # e.g. "am I a duck?", "am I going to die?" (r'am i (.*)\?', ( "Perhaps%1, perhaps not%1.", "Whether you are%1 or not is not for me to say.")), # what questions, e.g. "what time is it?" # problems: # person: "What do you want?" # chatbot: "Seek truth, not what do me want." 
(r'what (.*)\?', ( "Seek truth, not what%1.", "What%1 should not concern you.")), # how questions, e.g. "how do you do?" (r'how (.*)\?', ( "How do you suppose?", "Will an answer to that really help in your search for enlightenment?", "Ask yourself not how, but why.")), # can questions, e.g. "can you run?", "can you come over here please?" (r'can you (.*)\?', ( "I probably can, but I may not.", "Maybe I can%1, and maybe I cannot.", "I can do all, and I can do nothing.")), # can questions, e.g. "can I have some cake?", "can I know truth?" (r'can i (.*)\?', ( "You can%1 if you believe you can%1, and have a pure spirit.", "Seek truth and you will know if you can%1.")), # e.g. "It is raining" - implies the speaker is certain of a fact (r'it is (.*)', ( "How can you be certain that%1, when you do not even know yourself?", "Whether it is%1 or not does not change the way the world is.")), # e.g. "is there a doctor in the house?" (r'is there (.*)\?', ( "There is%1 if you believe there is.", "It is possible that there is%1.")), # e.g. "is it possible?", "is this true?" (r'is(.*)\?', ( "%1 is not relevant.", "Does this matter?")), # non-specific question (r'(.*)\?', ( "Do you think %1?", "You seek the truth. Does the truth seek you?", "If you intentionally pursue the answers to your questions, the answers become hard to see.", "The answer to your question cannot be told. It must be experienced.")), # expression of hate of form "I hate you" or "Kelly hates cheese" (r'(.*) (hate[s]?)|(dislike[s]?)|(don\'t like)(.*)', ( "Perhaps it is not about hating %2, but about hate from within.", "Weeds only grow when we dislike them", "Hate is a very strong emotion.")), # statement containing the word 'truth' (r'(.*) truth(.*)', ( "Seek truth, and truth will seek you.", "Remember, it is not the spoon which bends - only yourself.", "The search for truth is a long journey.")), # desire to do an action # e.g. "I want to go shopping" (r'i want to (.*)', ( "You may %1 if your heart truly desires to.", "You may have to %1.")), # desire for an object # e.g. "I want a pony" (r'i want (.*)', ( "Does your heart truly desire %1?", "Is this a desire of the heart, or of the mind?")), # e.g. "I can't wait" or "I can't do this" (r'i can\'t (.*)', ( "What we can and can't do is a limitation of the mind.", "There are limitations of the body, and limitations of the mind.", "Have you tried to%1 with a clear mind?")), # "I think.." indicates uncertainty. e.g. "I think so." # problem: exceptions... # e.g. "I think, therefore I am" (r'i think (.*)', ( "Uncertainty in an uncertain world.", "Indeed, how can we be certain of anything in such uncertain times.", "Are you not, in fact, certain that%1?")), # "I feel...emotions/sick/light-headed..." (r'i feel (.*)', ( "Your body and your emotions are both symptoms of your mind." "What do you believe is the root of such feelings?", "Feeling%1 can be a sign of your state-of-mind.")), # exclaimation mark indicating emotion # e.g. "Wow!" or "No!" (r'(.*)!', ( "I sense that you are feeling emotional today.", "You need to calm your emotions.")), # because [statement] # e.g. 
"because I said so" (r'because (.*)', ( "Does knowning the reasons behind things help you to understand" " the things themselves?", "If%1, what else must be true?")), # yes or no - raise an issue of certainty/correctness (r'(yes)|(no)', ( "Is there certainty in an uncertain world?", "It is better to be right than to be certain.")), # sentence containing word 'love' (r'(.*)love(.*)', ( "Think of the trees: they let the birds perch and fly with no intention to call them when they come, and no longing for their return when they fly away. Let your heart be like the trees.", "Free love!")), # sentence containing word 'understand' - r (r'(.*)understand(.*)', ( "If you understand, things are just as they are;" " if you do not understand, things are just as they are.", "Imagination is more important than knowledge.")), # 'I', 'me', 'my' - person is talking about themself. # this breaks down when words contain these - eg 'Thyme', 'Irish' (r'(.*)(me )|( me)|(my)|(mine)|(i)(.*)', ( "'I', 'me', 'my'... these are selfish expressions.", "Have you ever considered that you might be a selfish person?", "Try to consider others, not just yourself.", "Think not just of yourself, but of others.")), # 'you' starting a sentence # e.g. "you stink!" (r'you (.*)', ( "My path is not of conern to you.", "I am but one, and you but one more.")), # say goodbye with some extra Zen wisdom. (r'exit', ( "Farewell. The obstacle is the path.", "Farewell. Life is a journey, not a destination.", "Good bye. We are cups, constantly and quietly being filled." "\nThe trick is knowning how to tip ourselves over and let the beautiful stuff out.")), # fall through case - # when stumped, respond with generic zen wisdom # (r'(.*)', ( "When you're enlightened, every word is wisdom.", "Random talk is useless.", "The reverse side also has a reverse side.", "Form is emptiness, and emptiness is form.", "I pour out a cup of water. Is the cup empty?")) ) zen_chatbot = Chat(responses, reflections) def zen_chat(): print('*'*75) print("Zen Chatbot!".center(75)) print('*'*75) print('"Look beyond mere words and letters - look into your mind"'.center(75)) print("* Talk your way to truth with Zen Chatbot.") print("* Type 'quit' when you have had enough.") print('*'*75) print("Welcome, my child.") zen_chatbot.converse() def demo(): zen_chat() if __name__ == "__main__": demo() nltk-3.1/nltk/chunk/0000755000076500000240000000000012610001541014122 5ustar sbstaff00000000000000nltk-3.1/nltk/chunk/__init__.py0000644000076500000240000001641212607224144016253 0ustar sbstaff00000000000000# Natural Language Toolkit: Chunkers # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT # """ Classes and interfaces for identifying non-overlapping linguistic groups (such as base noun phrases) in unrestricted text. This task is called "chunk parsing" or "chunking", and the identified groups are called "chunks". The chunked text is represented using a shallow tree called a "chunk structure." A chunk structure is a tree containing tokens and chunks, where each chunk is a subtree containing only tokens. For example, the chunk structure for base noun phrase chunks in the sentence "I saw the big dog on the hill" is:: (SENTENCE: (NP: ) (NP: ) (NP: )) To convert a chunk structure back to a list of tokens, simply use the chunk structure's ``leaves()`` method. 
This module defines ``ChunkParserI``, a standard interface for chunking texts; and ``RegexpChunkParser``, a regular-expression based implementation of that interface. It also defines ``ChunkScore``, a utility class for scoring chunk parsers. RegexpChunkParser ================= ``RegexpChunkParser`` is an implementation of the chunk parser interface that uses regular-expressions over tags to chunk a text. Its ``parse()`` method first constructs a ``ChunkString``, which encodes a particular chunking of the input text. Initially, nothing is chunked. ``parse.RegexpChunkParser`` then applies a sequence of ``RegexpChunkRule`` rules to the ``ChunkString``, each of which modifies the chunking that it encodes. Finally, the ``ChunkString`` is transformed back into a chunk structure, which is returned. ``RegexpChunkParser`` can only be used to chunk a single kind of phrase. For example, you can use an ``RegexpChunkParser`` to chunk the noun phrases in a text, or the verb phrases in a text; but you can not use it to simultaneously chunk both noun phrases and verb phrases in the same text. (This is a limitation of ``RegexpChunkParser``, not of chunk parsers in general.) RegexpChunkRules ---------------- A ``RegexpChunkRule`` is a transformational rule that updates the chunking of a text by modifying its ``ChunkString``. Each ``RegexpChunkRule`` defines the ``apply()`` method, which modifies the chunking encoded by a ``ChunkString``. The ``RegexpChunkRule`` class itself can be used to implement any transformational rule based on regular expressions. There are also a number of subclasses, which can be used to implement simpler types of rules: - ``ChunkRule`` chunks anything that matches a given regular expression. - ``ChinkRule`` chinks anything that matches a given regular expression. - ``UnChunkRule`` will un-chunk any chunk that matches a given regular expression. - ``MergeRule`` can be used to merge two contiguous chunks. - ``SplitRule`` can be used to split a single chunk into two smaller chunks. - ``ExpandLeftRule`` will expand a chunk to incorporate new unchunked material on the left. - ``ExpandRightRule`` will expand a chunk to incorporate new unchunked material on the right. Tag Patterns ~~~~~~~~~~~~ A ``RegexpChunkRule`` uses a modified version of regular expression patterns, called "tag patterns". Tag patterns are used to match sequences of tags. Examples of tag patterns are:: r'(
    ||)+' r'+' r'' The differences between regular expression patterns and tag patterns are: - In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so ``'+'`` matches one or more repetitions of ``''``, not ``''``. - Whitespace in tag patterns is ignored. So ``'
    | '`` is equivalent to ``'
    |'`` - In tag patterns, ``'.'`` is equivalant to ``'[^{}<>]'``; so ``''`` matches any single tag starting with ``'NN'``. The function ``tag_pattern2re_pattern`` can be used to transform a tag pattern to an equivalent regular expression pattern. Efficiency ---------- Preliminary tests indicate that ``RegexpChunkParser`` can chunk at a rate of about 300 tokens/second, with a moderately complex rule set. There may be problems if ``RegexpChunkParser`` is used with more than 5,000 tokens at a time. In particular, evaluation of some regular expressions may cause the Python regular expression engine to exceed its maximum recursion depth. We have attempted to minimize these problems, but it is impossible to avoid them completely. We therefore recommend that you apply the chunk parser to a single sentence at a time. Emacs Tip --------- If you evaluate the following elisp expression in emacs, it will colorize a ``ChunkString`` when you use an interactive python shell with emacs or xemacs ("C-c !"):: (let () (defconst comint-mode-font-lock-keywords '(("<[^>]+>" 0 'font-lock-reference-face) ("[{}]" 0 'font-lock-function-name-face))) (add-hook 'comint-mode-hook (lambda () (turn-on-font-lock)))) You can evaluate this code by copying it to a temporary buffer, placing the cursor after the last close parenthesis, and typing "``C-x C-e``". You should evaluate it before running the interactive session. The change will last until you close emacs. Unresolved Issues ----------------- If we use the ``re`` module for regular expressions, Python's regular expression engine generates "maximum recursion depth exceeded" errors when processing very large texts, even for regular expressions that should not require any recursion. We therefore use the ``pre`` module instead. But note that ``pre`` does not include Unicode support, so this module will not work with unicode strings. Note also that ``pre`` regular expressions are not quite as advanced as ``re`` ones (e.g., no leftward zero-length assertions). :type CHUNK_TAG_PATTERN: regexp :var CHUNK_TAG_PATTERN: A regular expression to test whether a tag pattern is valid. """ from nltk.data import load from nltk.chunk.api import ChunkParserI from nltk.chunk.util import (ChunkScore, accuracy, tagstr2tree, conllstr2tree, conlltags2tree, tree2conlltags, tree2conllstr, tree2conlltags, ieerstr2tree) from nltk.chunk.regexp import RegexpChunkParser, RegexpParser # Standard treebank POS tagger _BINARY_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_binary.pickle' _MULTICLASS_NE_CHUNKER = 'chunkers/maxent_ne_chunker/english_ace_multiclass.pickle' def ne_chunk(tagged_tokens, binary=False): """ Use NLTK's currently recommended named entity chunker to chunk the given list of tagged tokens. """ if binary: chunker_pickle = _BINARY_NE_CHUNKER else: chunker_pickle = _MULTICLASS_NE_CHUNKER chunker = load(chunker_pickle) return chunker.parse(tagged_tokens) def ne_chunk_sents(tagged_sentences, binary=False): """ Use NLTK's currently recommended named entity chunker to chunk the given list of tagged sentences, each consisting of a list of tagged tokens. 
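    A usage sketch (not executed here: it assumes the pre-trained
    ``maxent_ne_chunker`` model and the supporting ``words`` corpus have been
    installed via ``nltk.download()``, and the sentence is purely
    illustrative)::

        from nltk import pos_tag, word_tokenize
        from nltk.chunk import ne_chunk_sents

        tagged_sents = [pos_tag(word_tokenize("I saw Bob in Boston."))]
        for tree in ne_chunk_sents(tagged_sents):
            print(tree)   # one chunk tree per input sentence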
""" if binary: chunker_pickle = _BINARY_NE_CHUNKER else: chunker_pickle = _MULTICLASS_NE_CHUNKER chunker = load(chunker_pickle) return chunker.parse_sents(tagged_sentences) nltk-3.1/nltk/chunk/api.py0000644000076500000240000000330712607224144015264 0ustar sbstaff00000000000000# Natural Language Toolkit: Chunk parsing API # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # URL: # For license information, see LICENSE.TXT ##////////////////////////////////////////////////////// ## Chunk Parser Interface ##////////////////////////////////////////////////////// from nltk.parse import ParserI from nltk.chunk.util import ChunkScore class ChunkParserI(ParserI): """ A processing interface for identifying non-overlapping groups in unrestricted text. Typically, chunk parsers are used to find base syntactic constituents, such as base noun phrases. Unlike ``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method will always generate a parse. """ def parse(self, tokens): """ Return the best chunk structure for the given tokens and return a tree. :param tokens: The list of (word, tag) tokens to be chunked. :type tokens: list(tuple) :rtype: Tree """ raise NotImplementedError() def evaluate(self, gold): """ Score the accuracy of the chunker against the gold standard. Remove the chunking the gold standard text, rechunk it using the chunker, and return a ``ChunkScore`` object reflecting the performance of this chunk peraser. :type gold: list(Tree) :param gold: The list of chunked sentences to score the chunker on. :rtype: ChunkScore """ chunkscore = ChunkScore() for correct in gold: chunkscore.score(correct, self.parse(correct.leaves())) return chunkscore nltk-3.1/nltk/chunk/named_entity.py0000644000076500000240000002511512607224144017174 0ustar sbstaff00000000000000# Natural Language Toolkit: Chunk parsing API # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Named entity chunker """ from __future__ import print_function import os, re, pickle from xml.etree import ElementTree as ET from nltk.tag import ClassifierBasedTagger, pos_tag try: from nltk.classify import MaxentClassifier except ImportError: pass from nltk.tree import Tree from nltk.tokenize import word_tokenize from nltk.data import find from nltk.chunk.api import ChunkParserI from nltk.chunk.util import ChunkScore class NEChunkParserTagger(ClassifierBasedTagger): """ The IOB tagger used by the chunk parser. 
""" def __init__(self, train): ClassifierBasedTagger.__init__( self, train=train, classifier_builder=self._classifier_builder) def _classifier_builder(self, train): return MaxentClassifier.train(train, algorithm='megam', gaussian_prior_sigma=1, trace=2) def _english_wordlist(self): try: wl = self._en_wordlist except AttributeError: from nltk.corpus import words self._en_wordlist = set(words.words('en-basic')) wl = self._en_wordlist return wl def _feature_detector(self, tokens, index, history): word = tokens[index][0] pos = simplify_pos(tokens[index][1]) if index == 0: prevword = prevprevword = None prevpos = prevprevpos = None prevshape = prevtag = prevprevtag = None elif index == 1: prevword = tokens[index-1][0].lower() prevprevword = None prevpos = simplify_pos(tokens[index-1][1]) prevprevpos = None prevtag = history[index-1][0] prevshape = prevprevtag = None else: prevword = tokens[index-1][0].lower() prevprevword = tokens[index-2][0].lower() prevpos = simplify_pos(tokens[index-1][1]) prevprevpos = simplify_pos(tokens[index-2][1]) prevtag = history[index-1] prevprevtag = history[index-2] prevshape = shape(prevword) if index == len(tokens)-1: nextword = nextnextword = None nextpos = nextnextpos = None elif index == len(tokens)-2: nextword = tokens[index+1][0].lower() nextpos = tokens[index+1][1].lower() nextnextword = None nextnextpos = None else: nextword = tokens[index+1][0].lower() nextpos = tokens[index+1][1].lower() nextnextword = tokens[index+2][0].lower() nextnextpos = tokens[index+2][1].lower() # 89.6 features = { 'bias': True, 'shape': shape(word), 'wordlen': len(word), 'prefix3': word[:3].lower(), 'suffix3': word[-3:].lower(), 'pos': pos, 'word': word, 'en-wordlist': (word in self._english_wordlist()), 'prevtag': prevtag, 'prevpos': prevpos, 'nextpos': nextpos, 'prevword': prevword, 'nextword': nextword, 'word+nextpos': '%s+%s' % (word.lower(), nextpos), 'pos+prevtag': '%s+%s' % (pos, prevtag), 'shape+prevtag': '%s+%s' % (prevshape, prevtag), } return features class NEChunkParser(ChunkParserI): """ Expected input: list of pos-tagged words """ def __init__(self, train): self._train(train) def parse(self, tokens): """ Each token should be a pos-tagged word """ tagged = self._tagger.tag(tokens) tree = self._tagged_to_parse(tagged) return tree def _train(self, corpus): # Convert to tagged sequence corpus = [self._parse_to_tagged(s) for s in corpus] self._tagger = NEChunkParserTagger(train=corpus) def _tagged_to_parse(self, tagged_tokens): """ Convert a list of tagged tokens to a chunk-parse tree. """ sent = Tree('S', []) for (tok,tag) in tagged_tokens: if tag == 'O': sent.append(tok) elif tag.startswith('B-'): sent.append(Tree(tag[2:], [tok])) elif tag.startswith('I-'): if (sent and isinstance(sent[-1], Tree) and sent[-1].label() == tag[2:]): sent[-1].append(tok) else: sent.append(Tree(tag[2:], [tok])) return sent @staticmethod def _parse_to_tagged(sent): """ Convert a chunk-parse tree to a list of tagged tokens. 
""" toks = [] for child in sent: if isinstance(child, Tree): if len(child) == 0: print("Warning -- empty chunk in sentence") continue toks.append((child[0], 'B-%s' % child.label())) for tok in child[1:]: toks.append((tok, 'I-%s' % child.label())) else: toks.append((child, 'O')) return toks def shape(word): if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word, re.UNICODE): return 'number' elif re.match('\W+$', word, re.UNICODE): return 'punct' elif re.match('\w+$', word, re.UNICODE): if word.istitle(): return 'upcase' elif word.islower(): return 'downcase' else: return 'mixedcase' else: return 'other' def simplify_pos(s): if s.startswith('V'): return "V" else: return s.split('-')[0] def postag_tree(tree): # Part-of-speech tagging. words = tree.leaves() tag_iter = (pos for (word, pos) in pos_tag(words)) newtree = Tree('S', []) for child in tree: if isinstance(child, Tree): newtree.append(Tree(child.label(), [])) for subchild in child: newtree[-1].append( (subchild, next(tag_iter)) ) else: newtree.append( (child, next(tag_iter)) ) return newtree def load_ace_data(roots, fmt='binary', skip_bnews=True): for root in roots: for root, dirs, files in os.walk(root): if root.endswith('bnews') and skip_bnews: continue for f in files: if f.endswith('.sgm'): for sent in load_ace_file(os.path.join(root, f), fmt): yield sent def load_ace_file(textfile, fmt): print(' - %s' % os.path.split(textfile)[1]) annfile = textfile+'.tmx.rdc.xml' # Read the xml file, and get a list of entities entities = [] with open(annfile, 'r') as infile: xml = ET.parse(infile).getroot() for entity in xml.findall('document/entity'): typ = entity.find('entity_type').text for mention in entity.findall('entity_mention'): if mention.get('TYPE') != 'NAME': continue # only NEs s = int(mention.find('head/charseq/start').text) e = int(mention.find('head/charseq/end').text)+1 entities.append( (s, e, typ) ) # Read the text file, and mark the entities. with open(textfile, 'r') as infile: text = infile.read() # Strip XML tags, since they don't count towards the indices text = re.sub('<(?!/?TEXT)[^>]+>', '', text) # Blank out anything before/after def subfunc(m): return ' '*(m.end()-m.start()-6) text = re.sub('[\s\S]*', subfunc, text) text = re.sub('[\s\S]*', '', text) # Simplify quotes text = re.sub("``", ' "', text) text = re.sub("''", '" ', text) entity_types = set(typ for (s,e,typ) in entities) # Binary distinction (NE or not NE) if fmt == 'binary': i = 0 toks = Tree('S', []) for (s,e,typ) in sorted(entities): if s < i: s = i # Overlapping! Deal with this better? if e <= s: continue toks.extend(word_tokenize(text[i:s])) toks.append(Tree('NE', text[s:e].split())) i = e toks.extend(word_tokenize(text[i:])) yield toks # Multiclass distinction (NE type) elif fmt == 'multiclass': i = 0 toks = Tree('S', []) for (s,e,typ) in sorted(entities): if s < i: s = i # Overlapping! Deal with this better? if e <= s: continue toks.extend(word_tokenize(text[i:s])) toks.append(Tree(typ, text[s:e].split())) i = e toks.extend(word_tokenize(text[i:])) yield toks else: raise ValueError('bad fmt value') # This probably belongs in a more general-purpose location (as does # the parse_to_tagged function). 
def cmp_chunks(correct, guessed): correct = NEChunkParser._parse_to_tagged(correct) guessed = NEChunkParser._parse_to_tagged(guessed) ellipsis = False for (w, ct), (w, gt) in zip(correct, guessed): if ct == gt == 'O': if not ellipsis: print(" %-15s %-15s %s" % (ct, gt, w)) print(' %-15s %-15s %s' % ('...', '...', '...')) ellipsis = True else: ellipsis = False print(" %-15s %-15s %s" % (ct, gt, w)) def build_model(fmt='binary'): print('Loading training data...') train_paths = [find('corpora/ace_data/ace.dev'), find('corpora/ace_data/ace.heldout'), find('corpora/ace_data/bbn.dev'), find('corpora/ace_data/muc.dev')] train_trees = load_ace_data(train_paths, fmt) train_data = [postag_tree(t) for t in train_trees] print('Training...') cp = NEChunkParser(train_data) del train_data print('Loading eval data...') eval_paths = [find('corpora/ace_data/ace.eval')] eval_trees = load_ace_data(eval_paths, fmt) eval_data = [postag_tree(t) for t in eval_trees] print('Evaluating...') chunkscore = ChunkScore() for i, correct in enumerate(eval_data): guess = cp.parse(correct.leaves()) chunkscore.score(correct, guess) if i < 3: cmp_chunks(correct, guess) print(chunkscore) outfilename = '/tmp/ne_chunker_%s.pickle' % fmt print('Saving chunker to %s...' % outfilename) with open(outfilename, 'wb') as outfile: pickle.dump(cp, outfile, -1) return cp if __name__ == '__main__': # Make sure that the pickled object has the right class name: from nltk.chunk.named_entity import build_model build_model('binary') build_model('multiclass') nltk-3.1/nltk/chunk/regexp.py0000644000076500000240000015225012607224144016007 0ustar sbstaff00000000000000# Natural Language Toolkit: Regular Expression Chunkers # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # URL: # For license information, see LICENSE.TXT from __future__ import print_function, unicode_literals from __future__ import division import re from nltk.tree import Tree from nltk.chunk.api import ChunkParserI from nltk.compat import python_2_unicode_compatible, string_types, unicode_repr ##////////////////////////////////////////////////////// ## ChunkString ##////////////////////////////////////////////////////// @python_2_unicode_compatible class ChunkString(object): """ A string-based encoding of a particular chunking of a text. Internally, the ``ChunkString`` class uses a single string to encode the chunking of the input text. This string contains a sequence of angle-bracket delimited tags, with chunking indicated by braces. An example of this encoding is:: {
    }{
    }<.>{
    }<.> ``ChunkString`` are created from tagged texts (i.e., lists of ``tokens`` whose type is ``TaggedType``). Initially, nothing is chunked. The chunking of a ``ChunkString`` can be modified with the ``xform()`` method, which uses a regular expression to transform the string representation. These transformations should only add and remove braces; they should *not* modify the sequence of angle-bracket delimited tags. :type _str: str :ivar _str: The internal string representation of the text's encoding. This string representation contains a sequence of angle-bracket delimited tags, with chunking indicated by braces. An example of this encoding is:: {
    }{
    }<.>{
    }<.> :type _pieces: list(tagged tokens and chunks) :ivar _pieces: The tagged tokens and chunks encoded by this ``ChunkString``. :ivar _debug: The debug level. See the constructor docs. :cvar IN_CHUNK_PATTERN: A zero-width regexp pattern string that will only match positions that are in chunks. :cvar IN_CHINK_PATTERN: A zero-width regexp pattern string that will only match positions that are in chinks. """ CHUNK_TAG_CHAR = r'[^\{\}<>]' CHUNK_TAG = r'(<%s+?>)' % CHUNK_TAG_CHAR IN_CHUNK_PATTERN = r'(?=[^\{]*\})' IN_CHINK_PATTERN = r'(?=[^\}]*(\{|$))' # These are used by _verify _CHUNK = r'(\{%s+?\})+?' % CHUNK_TAG _CHINK = r'(%s+?)+?' % CHUNK_TAG _VALID = re.compile(r'^(\{?%s\}?)*?$' % CHUNK_TAG) _BRACKETS = re.compile('[^\{\}]+') _BALANCED_BRACKETS = re.compile(r'(\{\})*$') def __init__(self, chunk_struct, debug_level=1): """ Construct a new ``ChunkString`` that encodes the chunking of the text ``tagged_tokens``. :type chunk_struct: Tree :param chunk_struct: The chunk structure to be further chunked. :type debug_level: int :param debug_level: The level of debugging which should be applied to transformations on the ``ChunkString``. The valid levels are: - 0: no checks - 1: full check on to_chunkstruct - 2: full check on to_chunkstruct and cursory check after each transformation. - 3: full check on to_chunkstruct and full check after each transformation. We recommend you use at least level 1. You should probably use level 3 if you use any non-standard subclasses of ``RegexpChunkRule``. """ self._root_label = chunk_struct.label() self._pieces = chunk_struct[:] tags = [self._tag(tok) for tok in self._pieces] self._str = '<' + '><'.join(tags) + '>' self._debug = debug_level def _tag(self, tok): if isinstance(tok, tuple): return tok[1] elif isinstance(tok, Tree): return tok.label() else: raise ValueError('chunk structures must contain tagged ' 'tokens or trees') def _verify(self, s, verify_tags): """ Check to make sure that ``s`` still corresponds to some chunked version of ``_pieces``. :type verify_tags: bool :param verify_tags: Whether the individual tags should be checked. If this is false, ``_verify`` will check to make sure that ``_str`` encodes a chunked version of *some* list of tokens. If this is true, then ``_verify`` will check to make sure that the tags in ``_str`` match those in ``_pieces``. :raise ValueError: if the internal string representation of this ``ChunkString`` is invalid or not consistent with _pieces. """ # Check overall form if not ChunkString._VALID.match(s): raise ValueError('Transformation generated invalid ' 'chunkstring:\n %s' % s) # Check that parens are balanced. If the string is long, we # have to do this in pieces, to avoid a maximum recursion # depth limit for regular expressions. brackets = ChunkString._BRACKETS.sub('', s) for i in range(1 + len(brackets) // 5000): substr = brackets[i*5000:i*5000+5000] if not ChunkString._BALANCED_BRACKETS.match(substr): raise ValueError('Transformation generated invalid ' 'chunkstring:\n %s' % s) if verify_tags<=0: return tags1 = (re.split(r'[\{\}<>]+', s))[1:-1] tags2 = [self._tag(piece) for piece in self._pieces] if tags1 != tags2: raise ValueError('Transformation generated invalid ' 'chunkstring: tag changed') def to_chunkstruct(self, chunk_label='CHUNK'): """ Return the chunk structure encoded by this ``ChunkString``. :rtype: Tree :raise ValueError: If a transformation has generated an invalid chunkstring. """ if self._debug > 0: self._verify(self._str, 1) # Use this alternating list to create the chunkstruct. 
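        # (re.split('[{}]', ...) alternates between chink and chunk material:
        # even-indexed pieces fall outside braces, odd-indexed pieces inside,
        # which is what the piece_in_chunk flag below keeps track of.)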
pieces = [] index = 0 piece_in_chunk = 0 for piece in re.split('[{}]', self._str): # Find the list of tokens contained in this piece. length = piece.count('<') subsequence = self._pieces[index:index+length] # Add this list of tokens to our pieces. if piece_in_chunk: pieces.append(Tree(chunk_label, subsequence)) else: pieces += subsequence # Update index, piece_in_chunk index += length piece_in_chunk = not piece_in_chunk return Tree(self._root_label, pieces) def xform(self, regexp, repl): """ Apply the given transformation to the string encoding of this ``ChunkString``. In particular, find all occurrences that match ``regexp``, and replace them using ``repl`` (as done by ``re.sub``). This transformation should only add and remove braces; it should *not* modify the sequence of angle-bracket delimited tags. Furthermore, this transformation may not result in improper bracketing. Note, in particular, that bracketing may not be nested. :type regexp: str or regexp :param regexp: A regular expression matching the substring that should be replaced. This will typically include a named group, which can be used by ``repl``. :type repl: str :param repl: An expression specifying what should replace the matched substring. Typically, this will include a named replacement group, specified by ``regexp``. :rtype: None :raise ValueError: If this transformation generated an invalid chunkstring. """ # Do the actual substitution s = re.sub(regexp, repl, self._str) # The substitution might have generated "empty chunks" # (substrings of the form "{}"). Remove them, so they don't # interfere with other transformations. s = re.sub('\{\}', '', s) # Make sure that the transformation was legal. if self._debug > 1: self._verify(s, self._debug-2) # Commit the transformation. self._str = s def __repr__(self): """ Return a string representation of this ``ChunkString``. It has the form:: }{
    }'> :rtype: str """ return '' % unicode_repr(self._str) def __str__(self): """ Return a formatted representation of this ``ChunkString``. This representation will include extra spaces to ensure that tags will line up with the representation of other ``ChunkStrings`` for the same text, regardless of the chunking. :rtype: str """ # Add spaces to make everything line up. str = re.sub(r'>(?!\})', r'> ', self._str) str = re.sub(r'([^\{])<', r'\1 <', str) if str[0] == '<': str = ' ' + str return str ##////////////////////////////////////////////////////// ## Chunking Rules ##////////////////////////////////////////////////////// @python_2_unicode_compatible class RegexpChunkRule(object): """ A rule specifying how to modify the chunking in a ``ChunkString``, using a transformational regular expression. The ``RegexpChunkRule`` class itself can be used to implement any transformational rule based on regular expressions. There are also a number of subclasses, which can be used to implement simpler types of rules, based on matching regular expressions. Each ``RegexpChunkRule`` has a regular expression and a replacement expression. When a ``RegexpChunkRule`` is "applied" to a ``ChunkString``, it searches the ``ChunkString`` for any substring that matches the regular expression, and replaces it using the replacement expression. This search/replace operation has the same semantics as ``re.sub``. Each ``RegexpChunkRule`` also has a description string, which gives a short (typically less than 75 characters) description of the purpose of the rule. This transformation defined by this ``RegexpChunkRule`` should only add and remove braces; it should *not* modify the sequence of angle-bracket delimited tags. Furthermore, this transformation may not result in nested or mismatched bracketing. """ def __init__(self, regexp, repl, descr): """ Construct a new RegexpChunkRule. :type regexp: regexp or str :param regexp: The regular expression for this ``RegexpChunkRule``. When this rule is applied to a ``ChunkString``, any substring that matches ``regexp`` will be replaced using the replacement string ``repl``. Note that this must be a normal regular expression, not a tag pattern. :type repl: str :param repl: The replacement expression for this ``RegexpChunkRule``. When this rule is applied to a ``ChunkString``, any substring that matches ``regexp`` will be replaced using ``repl``. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ if isinstance(regexp, string_types): regexp = re.compile(regexp) self._repl = repl self._descr = descr self._regexp = regexp def apply(self, chunkstr): # Keep docstring generic so we can inherit it. """ Apply this rule to the given ``ChunkString``. See the class reference documentation for a description of what it means to apply a rule. :type chunkstr: ChunkString :param chunkstr: The chunkstring to which this rule is applied. :rtype: None :raise ValueError: If this transformation generated an invalid chunkstring. """ chunkstr.xform(self._regexp, self._repl) def descr(self): """ Return a short description of the purpose and/or effect of this rule. :rtype: str """ return self._descr def __repr__(self): """ Return a string representation of this rule. It has the form:: }'->''> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. 
:rtype: str """ return (''+unicode_repr(self._repl)+'>') @staticmethod def fromstring(s): """ Create a RegexpChunkRule from a string description. Currently, the following formats are supported:: {regexp} # chunk rule }regexp{ # chink rule regexp}{regexp # split rule regexp{}regexp # merge rule Where ``regexp`` is a regular expression for the rule. Any text following the comment marker (``#``) will be used as the rule's description: >>> from nltk.chunk.regexp import RegexpChunkRule >>> RegexpChunkRule.fromstring('{
    ?+}') ?+'> """ # Split off the comment (but don't split on '\#') m = re.match(r'(?P(\\.|[^#])*)(?P#.*)?', s) rule = m.group('rule').strip() comment = (m.group('comment') or '')[1:].strip() # Pattern bodies: chunk, chink, split, merge try: if not rule: raise ValueError('Empty chunk pattern') if rule[0] == '{' and rule[-1] == '}': return ChunkRule(rule[1:-1], comment) elif rule[0] == '}' and rule[-1] == '{': return ChinkRule(rule[1:-1], comment) elif '}{' in rule: left, right = rule.split('}{') return SplitRule(left, right, comment) elif '{}' in rule: left, right = rule.split('{}') return MergeRule(left, right, comment) elif re.match('[^{}]*{[^{}]*}[^{}]*', rule): left, chunk, right = re.split('[{}]', rule) return ChunkRuleWithContext(left, chunk, right, comment) else: raise ValueError('Illegal chunk pattern: %s' % rule) except (ValueError, re.error): raise ValueError('Illegal chunk pattern: %s' % rule) @python_2_unicode_compatible class ChunkRule(RegexpChunkRule): """ A rule specifying how to add chunks to a ``ChunkString``, using a matching tag pattern. When applied to a ``ChunkString``, it will find any substring that matches this tag pattern and that is not already part of a chunk, and create a new chunk containing that substring. """ def __init__(self, tag_pattern, descr): """ Construct a new ``ChunkRule``. :type tag_pattern: str :param tag_pattern: This rule's tag pattern. When applied to a ``ChunkString``, this rule will chunk any substring that matches this tag pattern and that is not already part of a chunk. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ self._pattern = tag_pattern regexp = re.compile('(?P%s)%s' % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHINK_PATTERN)) RegexpChunkRule.__init__(self, regexp, '{\g}', descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: '> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return '' @python_2_unicode_compatible class ChinkRule(RegexpChunkRule): """ A rule specifying how to remove chinks to a ``ChunkString``, using a matching tag pattern. When applied to a ``ChunkString``, it will find any substring that matches this tag pattern and that is contained in a chunk, and remove it from that chunk, thus creating two new chunks. """ def __init__(self, tag_pattern, descr): """ Construct a new ``ChinkRule``. :type tag_pattern: str :param tag_pattern: This rule's tag pattern. When applied to a ``ChunkString``, this rule will find any substring that matches this tag pattern and that is contained in a chunk, and remove it from that chunk, thus creating two new chunks. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ self._pattern = tag_pattern regexp = re.compile('(?P%s)%s' % (tag_pattern2re_pattern(tag_pattern), ChunkString.IN_CHUNK_PATTERN)) RegexpChunkRule.__init__(self, regexp, '}\g{', descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: '> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return '' @python_2_unicode_compatible class UnChunkRule(RegexpChunkRule): """ A rule specifying how to remove chunks to a ``ChunkString``, using a matching tag pattern. 
When applied to a ``ChunkString``, it will find any complete chunk that matches this tag pattern, and un-chunk it. """ def __init__(self, tag_pattern, descr): """ Construct a new ``UnChunkRule``. :type tag_pattern: str :param tag_pattern: This rule's tag pattern. When applied to a ``ChunkString``, this rule will find any complete chunk that matches this tag pattern, and un-chunk it. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ self._pattern = tag_pattern regexp = re.compile('\{(?P%s)\}' % tag_pattern2re_pattern(tag_pattern)) RegexpChunkRule.__init__(self, regexp, '\g', descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: '> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return '' @python_2_unicode_compatible class MergeRule(RegexpChunkRule): """ A rule specifying how to merge chunks in a ``ChunkString``, using two matching tag patterns: a left pattern, and a right pattern. When applied to a ``ChunkString``, it will find any chunk whose end matches left pattern, and immediately followed by a chunk whose beginning matches right pattern. It will then merge those two chunks into a single chunk. """ def __init__(self, left_tag_pattern, right_tag_pattern, descr): """ Construct a new ``MergeRule``. :type right_tag_pattern: str :param right_tag_pattern: This rule's right tag pattern. When applied to a ``ChunkString``, this rule will find any chunk whose end matches ``left_tag_pattern``, and immediately followed by a chunk whose beginning matches this pattern. It will then merge those two chunks into a single chunk. :type left_tag_pattern: str :param left_tag_pattern: This rule's left tag pattern. When applied to a ``ChunkString``, this rule will find any chunk whose end matches this pattern, and immediately followed by a chunk whose beginning matches ``right_tag_pattern``. It will then merge those two chunks into a single chunk. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ # Ensure that the individual patterns are coherent. E.g., if # left='(' and right=')', then this will raise an exception: re.compile(tag_pattern2re_pattern(left_tag_pattern)) re.compile(tag_pattern2re_pattern(right_tag_pattern)) self._left_tag_pattern = left_tag_pattern self._right_tag_pattern = right_tag_pattern regexp = re.compile('(?P%s)}{(?=%s)' % (tag_pattern2re_pattern(left_tag_pattern), tag_pattern2re_pattern(right_tag_pattern))) RegexpChunkRule.__init__(self, regexp, '\g', descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: ', ''> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return ('') @python_2_unicode_compatible class SplitRule(RegexpChunkRule): """ A rule specifying how to split chunks in a ``ChunkString``, using two matching tag patterns: a left pattern, and a right pattern. When applied to a ``ChunkString``, it will find any chunk that matches the left pattern followed by the right pattern. It will then split the chunk into two new chunks, at the point between the two pattern matches. """ def __init__(self, left_tag_pattern, right_tag_pattern, descr): """ Construct a new ``SplitRule``. :type right_tag_pattern: str :param right_tag_pattern: This rule's right tag pattern. 
When applied to a ``ChunkString``, this rule will find any chunk containing a substring that matches ``left_tag_pattern`` followed by this pattern. It will then split the chunk into two new chunks at the point between these two matching patterns. :type left_tag_pattern: str :param left_tag_pattern: This rule's left tag pattern. When applied to a ``ChunkString``, this rule will find any chunk containing a substring that matches this pattern followed by ``right_tag_pattern``. It will then split the chunk into two new chunks at the point between these two matching patterns. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ # Ensure that the individual patterns are coherent. E.g., if # left='(' and right=')', then this will raise an exception: re.compile(tag_pattern2re_pattern(left_tag_pattern)) re.compile(tag_pattern2re_pattern(right_tag_pattern)) self._left_tag_pattern = left_tag_pattern self._right_tag_pattern = right_tag_pattern regexp = re.compile('(?P%s)(?=%s)' % (tag_pattern2re_pattern(left_tag_pattern), tag_pattern2re_pattern(right_tag_pattern))) RegexpChunkRule.__init__(self, regexp, r'\g}{', descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: ', '
    '> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return ('') @python_2_unicode_compatible class ExpandLeftRule(RegexpChunkRule): """ A rule specifying how to expand chunks in a ``ChunkString`` to the left, using two matching tag patterns: a left pattern, and a right pattern. When applied to a ``ChunkString``, it will find any chunk whose beginning matches right pattern, and immediately preceded by a chink whose end matches left pattern. It will then expand the chunk to incorporate the new material on the left. """ def __init__(self, left_tag_pattern, right_tag_pattern, descr): """ Construct a new ``ExpandRightRule``. :type right_tag_pattern: str :param right_tag_pattern: This rule's right tag pattern. When applied to a ``ChunkString``, this rule will find any chunk whose beginning matches ``right_tag_pattern``, and immediately preceded by a chink whose end matches this pattern. It will then merge those two chunks into a single chunk. :type left_tag_pattern: str :param left_tag_pattern: This rule's left tag pattern. When applied to a ``ChunkString``, this rule will find any chunk whose beginning matches this pattern, and immediately preceded by a chink whose end matches ``left_tag_pattern``. It will then expand the chunk to incorporate the new material on the left. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ # Ensure that the individual patterns are coherent. E.g., if # left='(' and right=')', then this will raise an exception: re.compile(tag_pattern2re_pattern(left_tag_pattern)) re.compile(tag_pattern2re_pattern(right_tag_pattern)) self._left_tag_pattern = left_tag_pattern self._right_tag_pattern = right_tag_pattern regexp = re.compile('(?P%s)\{(?P%s)' % (tag_pattern2re_pattern(left_tag_pattern), tag_pattern2re_pattern(right_tag_pattern))) RegexpChunkRule.__init__(self, regexp, '{\g\g', descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: ', ''> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return ('') @python_2_unicode_compatible class ExpandRightRule(RegexpChunkRule): """ A rule specifying how to expand chunks in a ``ChunkString`` to the right, using two matching tag patterns: a left pattern, and a right pattern. When applied to a ``ChunkString``, it will find any chunk whose end matches left pattern, and immediately followed by a chink whose beginning matches right pattern. It will then expand the chunk to incorporate the new material on the right. """ def __init__(self, left_tag_pattern, right_tag_pattern, descr): """ Construct a new ``ExpandRightRule``. :type right_tag_pattern: str :param right_tag_pattern: This rule's right tag pattern. When applied to a ``ChunkString``, this rule will find any chunk whose end matches ``left_tag_pattern``, and immediately followed by a chink whose beginning matches this pattern. It will then merge those two chunks into a single chunk. :type left_tag_pattern: str :param left_tag_pattern: This rule's left tag pattern. When applied to a ``ChunkString``, this rule will find any chunk whose end matches this pattern, and immediately followed by a chink whose beginning matches ``right_tag_pattern``. It will then expand the chunk to incorporate the new material on the right. 
:type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ # Ensure that the individual patterns are coherent. E.g., if # left='(' and right=')', then this will raise an exception: re.compile(tag_pattern2re_pattern(left_tag_pattern)) re.compile(tag_pattern2re_pattern(right_tag_pattern)) self._left_tag_pattern = left_tag_pattern self._right_tag_pattern = right_tag_pattern regexp = re.compile('(?P%s)\}(?P%s)' % (tag_pattern2re_pattern(left_tag_pattern), tag_pattern2re_pattern(right_tag_pattern))) RegexpChunkRule.__init__(self, regexp, '\g\g}', descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: ', ''> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return ('') @python_2_unicode_compatible class ChunkRuleWithContext(RegexpChunkRule): """ A rule specifying how to add chunks to a ``ChunkString``, using three matching tag patterns: one for the left context, one for the chunk, and one for the right context. When applied to a ``ChunkString``, it will find any substring that matches the chunk tag pattern, is surrounded by substrings that match the two context patterns, and is not already part of a chunk; and create a new chunk containing the substring that matched the chunk tag pattern. Caveat: Both the left and right context are consumed when this rule matches; therefore, if you need to find overlapping matches, you will need to apply your rule more than once. """ def __init__(self, left_context_tag_pattern, chunk_tag_pattern, right_context_tag_pattern, descr): """ Construct a new ``ChunkRuleWithContext``. :type left_context_tag_pattern: str :param left_context_tag_pattern: A tag pattern that must match the left context of ``chunk_tag_pattern`` for this rule to apply. :type chunk_tag_pattern: str :param chunk_tag_pattern: A tag pattern that must match for this rule to apply. If the rule does apply, then this pattern also identifies the substring that will be made into a chunk. :type right_context_tag_pattern: str :param right_context_tag_pattern: A tag pattern that must match the right context of ``chunk_tag_pattern`` for this rule to apply. :type descr: str :param descr: A short description of the purpose and/or effect of this rule. """ # Ensure that the individual patterns are coherent. E.g., if # left='(' and right=')', then this will raise an exception: re.compile(tag_pattern2re_pattern(left_context_tag_pattern)) re.compile(tag_pattern2re_pattern(chunk_tag_pattern)) re.compile(tag_pattern2re_pattern(right_context_tag_pattern)) self._left_context_tag_pattern = left_context_tag_pattern self._chunk_tag_pattern = chunk_tag_pattern self._right_context_tag_pattern = right_context_tag_pattern regexp = re.compile('(?P%s)(?P%s)(?P%s)%s' % (tag_pattern2re_pattern(left_context_tag_pattern), tag_pattern2re_pattern(chunk_tag_pattern), tag_pattern2re_pattern(right_context_tag_pattern), ChunkString.IN_CHINK_PATTERN)) replacement = r'\g{\g}\g' RegexpChunkRule.__init__(self, regexp, replacement, descr) def __repr__(self): """ Return a string representation of this rule. It has the form:: ', '', '
    '> Note that this representation does not include the description string; that string can be accessed separately with the ``descr()`` method. :rtype: str """ return '' % ( self._left_context_tag_pattern, self._chunk_tag_pattern, self._right_context_tag_pattern) ##////////////////////////////////////////////////////// ## Tag Pattern Format Conversion ##////////////////////////////////////////////////////// # this should probably be made more strict than it is -- e.g., it # currently accepts 'foo'. CHUNK_TAG_PATTERN = re.compile(r'^((%s|<%s>)*)$' % ('[^\{\}<>]+', '[^\{\}<>]+')) def tag_pattern2re_pattern(tag_pattern): """ Convert a tag pattern to a regular expression pattern. A "tag pattern" is a modified version of a regular expression, designed for matching sequences of tags. The differences between regular expression patterns and tag patterns are: - In tag patterns, ``'<'`` and ``'>'`` act as parentheses; so ``'+'`` matches one or more repetitions of ``''``, not ``''``. - Whitespace in tag patterns is ignored. So ``'
    | '`` is equivalent to ``'
    |'`` - In tag patterns, ``'.'`` is equivalant to ``'[^{}<>]'``; so ``''`` matches any single tag starting with ``'NN'``. In particular, ``tag_pattern2re_pattern`` performs the following transformations on the given pattern: - Replace '.' with '[^<>{}]' - Remove any whitespace - Add extra parens around '<' and '>', to make '<' and '>' act like parentheses. E.g., so that in '+', the '+' has scope over the entire ''; and so that in '', the '|' has scope over 'NN' and 'IN', but not '<' or '>'. - Check to make sure the resulting pattern is valid. :type tag_pattern: str :param tag_pattern: The tag pattern to convert to a regular expression pattern. :raise ValueError: If ``tag_pattern`` is not a valid tag pattern. In particular, ``tag_pattern`` should not include braces; and it should not contain nested or mismatched angle-brackets. :rtype: str :return: A regular expression pattern corresponding to ``tag_pattern``. """ # Clean up the regular expression tag_pattern = re.sub(r'\s', '', tag_pattern) tag_pattern = re.sub(r'<', '(<(', tag_pattern) tag_pattern = re.sub(r'>', ')>)', tag_pattern) # Check the regular expression if not CHUNK_TAG_PATTERN.match(tag_pattern): raise ValueError('Bad tag pattern: %r' % tag_pattern) # Replace "." with CHUNK_TAG_CHAR. # We have to do this after, since it adds {}[]<>s, which would # confuse CHUNK_TAG_PATTERN. # PRE doesn't have lookback assertions, so reverse twice, and do # the pattern backwards (with lookahead assertions). This can be # made much cleaner once we can switch back to SRE. def reverse_str(str): lst = list(str) lst.reverse() return ''.join(lst) tc_rev = reverse_str(ChunkString.CHUNK_TAG_CHAR) reversed = reverse_str(tag_pattern) reversed = re.sub(r'\.(?!\\(\\\\)*($|[^\\]))', tc_rev, reversed) tag_pattern = reverse_str(reversed) return tag_pattern ##////////////////////////////////////////////////////// ## RegexpChunkParser ##////////////////////////////////////////////////////// @python_2_unicode_compatible class RegexpChunkParser(ChunkParserI): """ A regular expression based chunk parser. ``RegexpChunkParser`` uses a sequence of "rules" to find chunks of a single type within a text. The chunking of the text is encoded using a ``ChunkString``, and each rule acts by modifying the chunking in the ``ChunkString``. The rules are all implemented using regular expression matching and substitution. The ``RegexpChunkRule`` class and its subclasses (``ChunkRule``, ``ChinkRule``, ``UnChunkRule``, ``MergeRule``, and ``SplitRule``) define the rules that are used by ``RegexpChunkParser``. Each rule defines an ``apply()`` method, which modifies the chunking encoded by a given ``ChunkString``. :type _rules: list(RegexpChunkRule) :ivar _rules: The list of rules that should be applied to a text. :type _trace: int :ivar _trace: The default level of tracing. """ def __init__(self, rules, chunk_label='NP', root_label='S', trace=0): """ Construct a new ``RegexpChunkParser``. :type rules: list(RegexpChunkRule) :param rules: The sequence of rules that should be used to generate the chunking for a tagged text. :type chunk_label: str :param chunk_label: The node value that should be used for chunk subtrees. This is typically a short string describing the type of information contained by the chunk, such as ``"NP"`` for base noun phrases. :type root_label: str :param root_label: The node value that should be used for the top node of the chunk structure. :type trace: int :param trace: The level of tracing that should be used when parsing a text. 
``0`` will generate no tracing output; ``1`` will generate normal tracing output; and ``2`` or higher will generate verbose tracing output. """ self._rules = rules self._trace = trace self._chunk_label = chunk_label self._root_label = root_label def _trace_apply(self, chunkstr, verbose): """ Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``, in turn. Generate trace output between each rule. If ``verbose`` is true, then generate verbose output. :type chunkstr: ChunkString :param chunkstr: The chunk string to which each rule should be applied. :type verbose: bool :param verbose: Whether output should be verbose. :rtype: None """ print('# Input:') print(chunkstr) for rule in self._rules: rule.apply(chunkstr) if verbose: print('#', rule.descr()+' ('+unicode_repr(rule)+'):') else: print('#', rule.descr()+':') print(chunkstr) def _notrace_apply(self, chunkstr): """ Apply each rule of this ``RegexpChunkParser`` to ``chunkstr``, in turn. :param chunkstr: The chunk string to which each rule should be applied. :type chunkstr: ChunkString :rtype: None """ for rule in self._rules: rule.apply(chunkstr) def parse(self, chunk_struct, trace=None): """ :type chunk_struct: Tree :param chunk_struct: the chunk structure to be (further) chunked :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; ``1`` will generate normal tracing output; and ``2`` or highter will generate verbose tracing output. This value overrides the trace level value that was given to the constructor. :rtype: Tree :return: a chunk structure that encodes the chunks in a given tagged sentence. A chunk is a non-overlapping linguistic group, such as a noun phrase. The set of chunks identified in the chunk structure depends on the rules used to define this ``RegexpChunkParser``. """ if len(chunk_struct) == 0: print('Warning: parsing empty text') return Tree(self._root_label, []) try: chunk_struct.label() except AttributeError: chunk_struct = Tree(self._root_label, chunk_struct) # Use the default trace value? if trace is None: trace = self._trace chunkstr = ChunkString(chunk_struct) # Apply the sequence of rules to the chunkstring. if trace: verbose = (trace>1) self._trace_apply(chunkstr, verbose) else: self._notrace_apply(chunkstr) # Use the chunkstring to create a chunk structure. return chunkstr.to_chunkstruct(self._chunk_label) def rules(self): """ :return: the sequence of rules used by ``RegexpChunkParser``. :rtype: list(RegexpChunkRule) """ return self._rules def __repr__(self): """ :return: a concise string representation of this ``RegexpChunkParser``. :rtype: str """ return "" % len(self._rules) def __str__(self): """ :return: a verbose string representation of this ``RegexpChunkParser``. :rtype: str """ s = "RegexpChunkParser with %d rules:\n" % len(self._rules) margin = 0 for rule in self._rules: margin = max(margin, len(rule.descr())) if margin < 35: format = " %" + repr(-(margin+3)) + "s%s\n" else: format = " %s\n %s\n" for rule in self._rules: s += format % (rule.descr(), unicode_repr(rule)) return s[:-1] ##////////////////////////////////////////////////////// ## Chunk Grammar ##////////////////////////////////////////////////////// @python_2_unicode_compatible class RegexpParser(ChunkParserI): """ A grammar based chunk parser. ``chunk.RegexpParser`` uses a set of regular expression patterns to specify the behavior of the parser. 
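For orientation, a minimal usage sketch (the grammar string and the tagged tokens are illustrative only, and the output is shown indicatively)::

    >>> from nltk.chunk import RegexpParser                              # doctest: +SKIP
    >>> cp = RegexpParser(r"NP: {<DT>?<JJ>*<NN>}")                       # doctest: +SKIP
    >>> cp.parse([("the", "DT"), ("little", "JJ"), ("cat", "NN"),
    ...           ("sat", "VBD")])                                       # doctest: +SKIP
    Tree('S', [Tree('NP', [('the', 'DT'), ('little', 'JJ'), ('cat', 'NN')]), ('sat', 'VBD')])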
The chunking of the text is encoded using a ``ChunkString``, and each rule acts by modifying the chunking in the ``ChunkString``. The rules are all implemented using regular expression matching and substitution. A grammar contains one or more clauses in the following form:: NP: {} # chunk determiners and adjectives }<[\.VI].*>+{ # chink any tag beginning with V, I, or . <.*>}{
    # split a chunk at a determiner {} # merge chunk ending with det/adj # with one starting with a noun The patterns of a clause are executed in order. An earlier pattern may introduce a chunk boundary that prevents a later pattern from executing. Sometimes an individual pattern will match on multiple, overlapping extents of the input. As with regular expression substitution more generally, the chunker will identify the first match possible, then continue looking for matches after this one has ended. The clauses of a grammar are also executed in order. A cascaded chunk parser is one having more than one clause. The maximum depth of a parse tree created by this chunk parser is the same as the number of clauses in the grammar. When tracing is turned on, the comment portion of a line is displayed each time the corresponding pattern is applied. :type _start: str :ivar _start: The start symbol of the grammar (the root node of resulting trees) :type _stages: int :ivar _stages: The list of parsing stages corresponding to the grammar """ def __init__(self, grammar, root_label='S', loop=1, trace=0): """ Create a new chunk parser, from the given start state and set of chunk patterns. :param grammar: The grammar, or a list of RegexpChunkParser objects :type grammar: str or list(RegexpChunkParser) :param root_label: The top node of the tree being created :type root_label: str or Nonterminal :param loop: The number of times to run through the patterns :type loop: int :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; ``1`` will generate normal tracing output; and ``2`` or higher will generate verbose tracing output. """ self._trace = trace self._stages = [] self._grammar = grammar self._loop = loop if isinstance(grammar, string_types): self._read_grammar(grammar, root_label, trace) else: # Make sur the grammar looks like it has the right type: type_err = ('Expected string or list of RegexpChunkParsers ' 'for the grammar.') try: grammar = list(grammar) except: raise TypeError(type_err) for elt in grammar: if not isinstance(elt, RegexpChunkParser): raise TypeError(type_err) self._stages = grammar def _read_grammar(self, grammar, root_label, trace): """ Helper function for __init__: read the grammar if it is a string. """ rules = [] lhs = None for line in grammar.split('\n'): line = line.strip() # New stage begins if there's an unescaped ':' m = re.match('(?P(\\.|[^:])*)(:(?P.*))', line) if m: # Record the stage that we just completed. self._add_stage(rules, lhs, root_label, trace) # Start a new stage. lhs = m.group('nonterminal').strip() rules = [] line = m.group('rule').strip() # Skip blank & comment-only lines if line=='' or line.startswith('#'): continue # Add the rule rules.append(RegexpChunkRule.fromstring(line)) # Record the final stage self._add_stage(rules, lhs, root_label, trace) def _add_stage(self, rules, lhs, root_label, trace): """ Helper function for __init__: add a new stage to the parser. """ if rules != []: if not lhs: raise ValueError('Expected stage marker (eg NP:)') parser = RegexpChunkParser(rules, chunk_label=lhs, root_label=root_label, trace=trace) self._stages.append(parser) def parse(self, chunk_struct, trace=None): """ Apply the chunk parser to this input. :type chunk_struct: Tree :param chunk_struct: the chunk structure to be (further) chunked (this tree is modified, and is also returned) :type trace: int :param trace: The level of tracing that should be used when parsing a text. 
``0`` will generate no tracing output; ``1`` will generate normal tracing output; and ``2`` or highter will generate verbose tracing output. This value overrides the trace level value that was given to the constructor. :return: the chunked output. :rtype: Tree """ if trace is None: trace = self._trace for i in range(self._loop): for parser in self._stages: chunk_struct = parser.parse(chunk_struct, trace=trace) return chunk_struct def __repr__(self): """ :return: a concise string representation of this ``chunk.RegexpParser``. :rtype: str """ return "" % len(self._stages) def __str__(self): """ :return: a verbose string representation of this ``RegexpParser``. :rtype: str """ s = "chunk.RegexpParser with %d stages:\n" % len(self._stages) margin = 0 for parser in self._stages: s += "%s\n" % parser return s[:-1] ##////////////////////////////////////////////////////// ## Demonstration code ##////////////////////////////////////////////////////// def demo_eval(chunkparser, text): """ Demonstration code for evaluating a chunk parser, using a ``ChunkScore``. This function assumes that ``text`` contains one sentence per line, and that each sentence has the form expected by ``tree.chunk``. It runs the given chunk parser on each sentence in the text, and scores the result. It prints the final score (precision, recall, and f-measure); and reports the set of chunks that were missed and the set of chunks that were incorrect. (At most 10 missing chunks and 10 incorrect chunks are reported). :param chunkparser: The chunkparser to be tested :type chunkparser: ChunkParserI :param text: The chunked tagged text that should be used for evaluation. :type text: str """ from nltk import chunk from nltk.tree import Tree # Evaluate our chunk parser. chunkscore = chunk.ChunkScore() for sentence in text.split('\n'): print(sentence) sentence = sentence.strip() if not sentence: continue gold = chunk.tagstr2tree(sentence) tokens = gold.leaves() test = chunkparser.parse(Tree('S', tokens), trace=1) chunkscore.score(gold, test) print() print('/'+('='*75)+'\\') print('Scoring', chunkparser) print(('-'*77)) print('Precision: %5.1f%%' % (chunkscore.precision()*100), ' '*4, end=' ') print('Recall: %5.1f%%' % (chunkscore.recall()*100), ' '*6, end=' ') print('F-Measure: %5.1f%%' % (chunkscore.f_measure()*100)) # Missed chunks. if chunkscore.missed(): print('Missed:') missed = chunkscore.missed() for chunk in missed[:10]: print(' ', ' '.join(map(str,chunk))) if len(chunkscore.missed()) > 10: print(' ...') # Incorrect chunks. if chunkscore.incorrect(): print('Incorrect:') incorrect = chunkscore.incorrect() for chunk in incorrect[:10]: print(' ', ' '.join(map(str,chunk))) if len(chunkscore.incorrect()) > 10: print(' ...') print('\\'+('='*75)+'/') print() def demo(): """ A demonstration for the ``RegexpChunkParser`` class. A single text is parsed with four different chunk parsers, using a variety of rules and strategies. """ from nltk import chunk, Tree text = """\ [ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] ./. [ John/NNP ] saw/VBD [the/DT cats/NNS] [the/DT dog/NN] chased/VBD ./. [ John/NNP ] thinks/VBZ [ Mary/NN ] saw/VBD [ the/DT cat/NN ] sit/VB on/IN [ the/DT mat/NN ]./. """ print('*'*75) print('Evaluation text:') print(text) print('*'*75) print() grammar = r""" NP: # NP stage {
    ?*} # chunk determiners, adjectives and nouns {+} # chunk proper nouns """ cp = chunk.RegexpParser(grammar) demo_eval(cp, text) grammar = r""" NP: {<.*>} # start by chunking each tag }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods {} # merge det/adj with nouns """ cp = chunk.RegexpParser(grammar) demo_eval(cp, text) grammar = r""" NP: {
    ?*} # chunk determiners, adjectives and nouns VP: {?} # VP = verb words """ cp = chunk.RegexpParser(grammar) demo_eval(cp, text) grammar = r""" NP: {<.*>*} # start by chunking everything }<[\.VI].*>+{ # chink any verbs, prepositions or periods <.*>}{
    # separate on determiners PP: {} # PP = preposition + noun phrase VP: {*} # VP = verb words + NPs and PPs """ cp = chunk.RegexpParser(grammar) demo_eval(cp, text) # Evaluation from nltk.corpus import conll2000 print() print("Demonstration of empty grammar:") cp = chunk.RegexpParser("") print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt', chunk_types=('NP',)))) print() print("Demonstration of accuracy evaluation using CoNLL tags:") grammar = r""" NP: {<.*>} # start by chunking each tag }<[\.VI].*>+{ # unchunk any verbs, prepositions or periods {} # merge det/adj with nouns """ cp = chunk.RegexpParser(grammar) print(chunk.accuracy(cp, conll2000.chunked_sents('test.txt')[:5])) print() print("Demonstration of tagged token input") grammar = r""" NP: {<.*>*} # start by chunking everything }<[\.VI].*>+{ # chink any verbs, prepositions or periods <.*>}{
    # separate on determiners PP: {} # PP = preposition + noun phrase VP: {*} # VP = verb words + NPs and PPs """ cp = chunk.RegexpParser(grammar) print(cp.parse([("the","DT"), ("little","JJ"), ("cat", "NN"), ("sat", "VBD"), ("on", "IN"), ("the", "DT"), ("mat", "NN"), (".", ".")])) if __name__ == '__main__': demo() nltk-3.1/nltk/chunk/util.py0000644000076500000240000005025712607224144015476 0ustar sbstaff00000000000000# Natural Language Toolkit: Chunk format conversions # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # URL: # For license information, see LICENSE.TXT from __future__ import print_function, unicode_literals import re from nltk.tree import Tree from nltk.tag.mapping import map_tag from nltk.tag.util import str2tuple from nltk.compat import python_2_unicode_compatible ##////////////////////////////////////////////////////// ## EVALUATION ##////////////////////////////////////////////////////// from nltk.metrics import accuracy as _accuracy def accuracy(chunker, gold): """ Score the accuracy of the chunker against the gold standard. Strip the chunk information from the gold standard and rechunk it using the chunker, then compute the accuracy score. :type chunker: ChunkParserI :param chunker: The chunker being evaluated. :type gold: tree :param gold: The chunk structures to score the chunker on. :rtype: float """ gold_tags = [] test_tags = [] for gold_tree in gold: test_tree = chunker.parse(gold_tree.flatten()) gold_tags += tree2conlltags(gold_tree) test_tags += tree2conlltags(test_tree) # print 'GOLD:', gold_tags[:50] # print 'TEST:', test_tags[:50] return _accuracy(gold_tags, test_tags) # Patched for increased performance by Yoav Goldberg , 2006-01-13 # -- statistics are evaluated only on demand, instead of at every sentence evaluation # # SB: use nltk.metrics for precision/recall scoring? # class ChunkScore(object): """ A utility class for scoring chunk parsers. ``ChunkScore`` can evaluate a chunk parser's output, based on a number of statistics (precision, recall, f-measure, misssed chunks, incorrect chunks). It can also combine the scores from the parsing of multiple texts; this makes it significantly easier to evaluate a chunk parser that operates one sentence at a time. Texts are evaluated with the ``score`` method. The results of evaluation can be accessed via a number of accessor methods, such as ``precision`` and ``f_measure``. A typical use of the ``ChunkScore`` class is:: >>> chunkscore = ChunkScore() # doctest: +SKIP >>> for correct in correct_sentences: # doctest: +SKIP ... guess = chunkparser.parse(correct.leaves()) # doctest: +SKIP ... chunkscore.score(correct, guess) # doctest: +SKIP >>> print('F Measure:', chunkscore.f_measure()) # doctest: +SKIP F Measure: 0.823 :ivar kwargs: Keyword arguments: - max_tp_examples: The maximum number actual examples of true positives to record. This affects the ``correct`` member function: ``correct`` will not return more than this number of true positive examples. This does *not* affect any of the numerical metrics (precision, recall, or f-measure) - max_fp_examples: The maximum number actual examples of false positives to record. This affects the ``incorrect`` member function and the ``guessed`` member function: ``incorrect`` will not return more than this number of examples, and ``guessed`` will not return more than this number of true positive examples. 
This does *not* affect any of the numerical metrics (precision, recall, or f-measure) - max_fn_examples: The maximum number actual examples of false negatives to record. This affects the ``missed`` member function and the ``correct`` member function: ``missed`` will not return more than this number of examples, and ``correct`` will not return more than this number of true negative examples. This does *not* affect any of the numerical metrics (precision, recall, or f-measure) - chunk_label: A regular expression indicating which chunks should be compared. Defaults to ``'.*'`` (i.e., all chunks). :type _tp: list(Token) :ivar _tp: List of true positives :type _fp: list(Token) :ivar _fp: List of false positives :type _fn: list(Token) :ivar _fn: List of false negatives :type _tp_num: int :ivar _tp_num: Number of true positives :type _fp_num: int :ivar _fp_num: Number of false positives :type _fn_num: int :ivar _fn_num: Number of false negatives. """ def __init__(self, **kwargs): self._correct = set() self._guessed = set() self._tp = set() self._fp = set() self._fn = set() self._max_tp = kwargs.get('max_tp_examples', 100) self._max_fp = kwargs.get('max_fp_examples', 100) self._max_fn = kwargs.get('max_fn_examples', 100) self._chunk_label = kwargs.get('chunk_label', '.*') self._tp_num = 0 self._fp_num = 0 self._fn_num = 0 self._count = 0 self._tags_correct = 0.0 self._tags_total = 0.0 self._measuresNeedUpdate = False def _updateMeasures(self): if (self._measuresNeedUpdate): self._tp = self._guessed & self._correct self._fn = self._correct - self._guessed self._fp = self._guessed - self._correct self._tp_num = len(self._tp) self._fp_num = len(self._fp) self._fn_num = len(self._fn) self._measuresNeedUpdate = False def score(self, correct, guessed): """ Given a correctly chunked sentence, score another chunked version of the same sentence. :type correct: chunk structure :param correct: The known-correct ("gold standard") chunked sentence. :type guessed: chunk structure :param guessed: The chunked sentence to be scored. """ self._correct |= _chunksets(correct, self._count, self._chunk_label) self._guessed |= _chunksets(guessed, self._count, self._chunk_label) self._count += 1 self._measuresNeedUpdate = True # Keep track of per-tag accuracy (if possible) try: correct_tags = tree2conlltags(correct) guessed_tags = tree2conlltags(guessed) except ValueError: # This exception case is for nested chunk structures, # where tree2conlltags will fail with a ValueError: "Tree # is too deeply nested to be printed in CoNLL format." correct_tags = guessed_tags = () self._tags_total += len(correct_tags) self._tags_correct += sum(1 for (t,g) in zip(guessed_tags, correct_tags) if t==g) def accuracy(self): """ Return the overall tag-based accuracy for all text that have been scored by this ``ChunkScore``, using the IOB (conll2000) tag encoding. :rtype: float """ if self._tags_total == 0: return 1 return self._tags_correct/self._tags_total def precision(self): """ Return the overall precision for all texts that have been scored by this ``ChunkScore``. :rtype: float """ self._updateMeasures() div = self._tp_num + self._fp_num if div == 0: return 0 else: return float(self._tp_num) / div def recall(self): """ Return the overall recall for all texts that have been scored by this ``ChunkScore``. 
:rtype: float """ self._updateMeasures() div = self._tp_num + self._fn_num if div == 0: return 0 else: return float(self._tp_num) / div def f_measure(self, alpha=0.5): """ Return the overall F measure for all texts that have been scored by this ``ChunkScore``. :param alpha: the relative weighting of precision and recall. Larger alpha biases the score towards the precision value, while smaller alpha biases the score towards the recall value. ``alpha`` should have a value in the range [0,1]. :type alpha: float :rtype: float """ self._updateMeasures() p = self.precision() r = self.recall() if p == 0 or r == 0: # what if alpha is 0 or 1? return 0 return 1/(alpha/p + (1-alpha)/r) def missed(self): """ Return the chunks which were included in the correct chunk structures, but not in the guessed chunk structures, listed in input order. :rtype: list of chunks """ self._updateMeasures() chunks = list(self._fn) return [c[1] for c in chunks] # discard position information def incorrect(self): """ Return the chunks which were included in the guessed chunk structures, but not in the correct chunk structures, listed in input order. :rtype: list of chunks """ self._updateMeasures() chunks = list(self._fp) return [c[1] for c in chunks] # discard position information def correct(self): """ Return the chunks which were included in the correct chunk structures, listed in input order. :rtype: list of chunks """ chunks = list(self._correct) return [c[1] for c in chunks] # discard position information def guessed(self): """ Return the chunks which were included in the guessed chunk structures, listed in input order. :rtype: list of chunks """ chunks = list(self._guessed) return [c[1] for c in chunks] # discard position information def __len__(self): self._updateMeasures() return self._tp_num + self._fn_num def __repr__(self): """ Return a concise representation of this ``ChunkScoring``. :rtype: str """ return '' def __str__(self): """ Return a verbose representation of this ``ChunkScoring``. This representation includes the precision, recall, and f-measure scores. For other information about the score, use the accessor methods (e.g., ``missed()`` and ``incorrect()``). :rtype: str """ return ("ChunkParse score:\n" + (" IOB Accuracy: %5.1f%%\n" % (self.accuracy()*100)) + (" Precision: %5.1f%%\n" % (self.precision()*100)) + (" Recall: %5.1f%%\n" % (self.recall()*100))+ (" F-Measure: %5.1f%%" % (self.f_measure()*100))) # extract chunks, and assign unique id, the absolute position of # the first word of the chunk def _chunksets(t, count, chunk_label): pos = 0 chunks = [] for child in t: if isinstance(child, Tree): if re.match(chunk_label, child.label()): chunks.append(((count, pos), child.freeze())) pos += len(child.leaves()) else: pos += 1 return set(chunks) def tagstr2tree(s, chunk_label="NP", root_label="S", sep='/', source_tagset=None, target_tagset=None): """ Divide a string of bracketted tagged text into chunks and unchunked tokens, and produce a Tree. Chunks are marked by square brackets (``[...]``). Words are delimited by whitespace, and each word should have the form ``text/tag``. Words that do not contain a slash are assigned a ``tag`` of None. 
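For example (a minimal sketch; the bracketed string is illustrative and the output is shown indicatively)::

    >>> from nltk.chunk.util import tagstr2tree                          # doctest: +SKIP
    >>> tagstr2tree("[ the/DT cat/NN ] sat/VBD")                         # doctest: +SKIP
    Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]), ('sat', 'VBD')])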
:param s: The string to be converted :type s: str :param chunk_label: The label to use for chunk nodes :type chunk_label: str :param root_label: The label to use for the root of the tree :type root_label: str :rtype: Tree """ WORD_OR_BRACKET = re.compile(r'\[|\]|[^\[\]\s]+') stack = [Tree(root_label, [])] for match in WORD_OR_BRACKET.finditer(s): text = match.group() if text[0] == '[': if len(stack) != 1: raise ValueError('Unexpected [ at char %d' % match.start()) chunk = Tree(chunk_label, []) stack[-1].append(chunk) stack.append(chunk) elif text[0] == ']': if len(stack) != 2: raise ValueError('Unexpected ] at char %d' % match.start()) stack.pop() else: if sep is None: stack[-1].append(text) else: word, tag = str2tuple(text, sep) if source_tagset and target_tagset: tag = map_tag(source_tagset, target_tagset, tag) stack[-1].append((word, tag)) if len(stack) != 1: raise ValueError('Expected ] at char %d' % len(s)) return stack[0] ### CONLL _LINE_RE = re.compile('(\S+)\s+(\S+)\s+([IOB])-?(\S+)?') def conllstr2tree(s, chunk_types=('NP', 'PP', 'VP'), root_label="S"): """ Return a chunk structure for a single sentence encoded in the given CONLL 2000 style string. This function converts a CoNLL IOB string into a tree. It uses the specified chunk types (defaults to NP, PP and VP), and creates a tree rooted at a node labeled S (by default). :param s: The CoNLL string to be converted. :type s: str :param chunk_types: The chunk types to be converted. :type chunk_types: tuple :param root_label: The node label to use for the root. :type root_label: str :rtype: Tree """ stack = [Tree(root_label, [])] for lineno, line in enumerate(s.split('\n')): if not line.strip(): continue # Decode the line. match = _LINE_RE.match(line) if match is None: raise ValueError('Error on line %d' % lineno) (word, tag, state, chunk_type) = match.groups() # If it's a chunk type we don't care about, treat it as O. if (chunk_types is not None and chunk_type not in chunk_types): state = 'O' # For "Begin"/"Outside", finish any completed chunks - # also do so for "Inside" which don't match the previous token. mismatch_I = state == 'I' and chunk_type != stack[-1].label() if state in 'BO' or mismatch_I: if len(stack) == 2: stack.pop() # For "Begin", start a new chunk. if state == 'B' or mismatch_I: chunk = Tree(chunk_type, []) stack[-1].append(chunk) stack.append(chunk) # Add the new word token. stack[-1].append((word, tag)) return stack[0] def tree2conlltags(t): """ Return a list of 3-tuples containing ``(word, tag, IOB-tag)``. Convert a tree to the CoNLL IOB tag format. :param t: The tree to be converted. :type t: Tree :rtype: list(tuple) """ tags = [] for child in t: try: category = child.label() prefix = "B-" for contents in child: if isinstance(contents, Tree): raise ValueError("Tree is too deeply nested to be printed in CoNLL format") tags.append((contents[0], contents[1], prefix+category)) prefix = "I-" except AttributeError: tags.append((child[0], child[1], "O")) return tags def conlltags2tree(sentence, chunk_types=('NP','PP','VP'), root_label='S', strict=False): """ Convert the CoNLL IOB format to a tree. 
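For example (a minimal sketch; the ``(word, tag, IOB-tag)`` triples are illustrative and the output is shown indicatively)::

    >>> from nltk.chunk.util import conlltags2tree                       # doctest: +SKIP
    >>> conlltags2tree([("the", "DT", "B-NP"), ("cat", "NN", "I-NP"),
    ...                 ("sat", "VBD", "O")])                            # doctest: +SKIP
    Tree('S', [Tree('NP', [('the', 'DT'), ('cat', 'NN')]), ('sat', 'VBD')])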
""" tree = Tree(root_label, []) for (word, postag, chunktag) in sentence: if chunktag is None: if strict: raise ValueError("Bad conll tag sequence") else: # Treat as O tree.append((word,postag)) elif chunktag.startswith('B-'): tree.append(Tree(chunktag[2:], [(word,postag)])) elif chunktag.startswith('I-'): if (len(tree)==0 or not isinstance(tree[-1], Tree) or tree[-1].label() != chunktag[2:]): if strict: raise ValueError("Bad conll tag sequence") else: # Treat as B-* tree.append(Tree(chunktag[2:], [(word,postag)])) else: tree[-1].append((word,postag)) elif chunktag == 'O': tree.append((word,postag)) else: raise ValueError("Bad conll tag %r" % chunktag) return tree def tree2conllstr(t): """ Return a multiline string where each line contains a word, tag and IOB tag. Convert a tree to the CoNLL IOB string format :param t: The tree to be converted. :type t: Tree :rtype: str """ lines = [" ".join(token) for token in tree2conlltags(t)] return '\n'.join(lines) ### IEER _IEER_DOC_RE = re.compile(r'\s*' r'(\s*(?P.+?)\s*\s*)?' r'(\s*(?P.+?)\s*\s*)?' r'(\s*(?P.+?)\s*\s*)?' r'\s*' r'(\s*(?P.+?)\s*\s*)?' r'(?P.*?)\s*' r'\s*\s*', re.DOTALL) _IEER_TYPE_RE = re.compile(']*?type="(?P\w+)"') def _ieer_read_text(s, root_label): stack = [Tree(root_label, [])] # s will be None if there is no headline in the text # return the empty list in place of a Tree if s is None: return [] for piece_m in re.finditer('<[^>]+>|[^\s<]+', s): piece = piece_m.group() try: if piece.startswith('.... m = _IEER_DOC_RE.match(s) if m: return { 'text': _ieer_read_text(m.group('text'), root_label), 'docno': m.group('docno'), 'doctype': m.group('doctype'), 'date_time': m.group('date_time'), #'headline': m.group('headline') # we want to capture NEs in the headline too! 'headline': _ieer_read_text(m.group('headline'), root_label), } else: return _ieer_read_text(s, root_label) def demo(): s = "[ Pierre/NNP Vinken/NNP ] ,/, [ 61/CD years/NNS ] old/JJ ,/, will/MD join/VB [ the/DT board/NN ] ./." import nltk t = nltk.chunk.tagstr2tree(s, chunk_label='NP') t.pprint() print() s = """ These DT B-NP research NN I-NP protocols NNS I-NP offer VBP B-VP to TO B-PP the DT B-NP patient NN I-NP not RB O only RB O the DT B-NP very RB I-NP best JJS I-NP therapy NN I-NP which WDT B-NP we PRP B-NP have VBP B-VP established VBN I-VP today NN B-NP but CC B-NP also RB I-NP the DT B-NP hope NN I-NP of IN B-PP something NN B-NP still RB B-ADJP better JJR I-ADJP . . O """ conll_tree = conllstr2tree(s, chunk_types=('NP', 'PP')) conll_tree.pprint() # Demonstrate CoNLL output print("CoNLL output:") print(nltk.chunk.tree2conllstr(conll_tree)) print() if __name__ == '__main__': demo() nltk-3.1/nltk/classify/0000755000076500000240000000000012610001541014627 5ustar sbstaff00000000000000nltk-3.1/nltk/classify/__init__.py0000644000076500000240000001067212607224144016762 0ustar sbstaff00000000000000# Natural Language Toolkit: Classifiers # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Classes and interfaces for labeling tokens with category labels (or "class labels"). Typically, labels are represented with strings (such as ``'health'`` or ``'sports'``). Classifiers can be used to perform a wide range of classification tasks. For example, classifiers can be used... 
- to classify documents by topic - to classify ambiguous words by which word sense is intended - to classify acoustic signals by which phoneme they represent - to classify sentences by their author Features ======== In order to decide which category label is appropriate for a given token, classifiers examine one or more 'features' of the token. These "features" are typically chosen by hand, and indicate which aspects of the token are relevant to the classification decision. For example, a document classifier might use a separate feature for each word, recording how often that word occurred in the document. Featuresets =========== The features describing a token are encoded using a "featureset", which is a dictionary that maps from "feature names" to "feature values". Feature names are unique strings that indicate what aspect of the token is encoded by the feature. Examples include ``'prevword'``, for a feature whose value is the previous word; and ``'contains-word(library)'`` for a feature that is true when a document contains the word ``'library'``. Feature values are typically booleans, numbers, or strings, depending on which feature they describe. Featuresets are typically constructed using a "feature detector" (also known as a "feature extractor"). A feature detector is a function that takes a token (and sometimes information about its context) as its input, and returns a featureset describing that token. For example, the following feature detector converts a document (stored as a list of words) to a featureset describing the set of words included in the document: >>> # Define a feature detector function. >>> def document_features(document): ... return dict([('contains-word(%s)' % w, True) for w in document]) Feature detectors are typically applied to each token before it is fed to the classifier: >>> # Classify each Gutenberg document. >>> from nltk.corpus import gutenberg >>> for fileid in gutenberg.fileids(): # doctest: +SKIP ... doc = gutenberg.words(fileid) # doctest: +SKIP ... print fileid, classifier.classify(document_features(doc)) # doctest: +SKIP The parameters that a feature detector expects will vary, depending on the task and the needs of the feature detector. For example, a feature detector for word sense disambiguation (WSD) might take as its input a sentence, and the index of a word that should be classified, and return a featureset for that word. The following feature detector for WSD includes features describing the left and right contexts of the target word: >>> def wsd_features(sentence, index): ... featureset = {} ... for i in range(max(0, index-3), index): ... featureset['left-context(%s)' % sentence[i]] = True ... for i in range(index, max(index+3, len(sentence))): ... featureset['right-context(%s)' % sentence[i]] = True ... return featureset Training Classifiers ==================== Most classifiers are built by training them on a list of hand-labeled examples, known as the "training set". Training sets are represented as lists of ``(featuredict, label)`` tuples. 
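A minimal training sketch (the toy featuresets and labels are illustrative only)::

    >>> from nltk.classify import NaiveBayesClassifier                   # doctest: +SKIP
    >>> train_set = [({'contains-word(rain)': True}, 'weather'),
    ...              ({'contains-word(goal)': True}, 'sports')]
    >>> classifier = NaiveBayesClassifier.train(train_set)               # doctest: +SKIP
    >>> classifier.classify({'contains-word(goal)': True})               # doctest: +SKIP
    'sports'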
""" from nltk.classify.api import ClassifierI, MultiClassifierI from nltk.classify.megam import config_megam, call_megam from nltk.classify.weka import WekaClassifier, config_weka from nltk.classify.naivebayes import NaiveBayesClassifier from nltk.classify.positivenaivebayes import PositiveNaiveBayesClassifier from nltk.classify.decisiontree import DecisionTreeClassifier from nltk.classify.rte_classify import rte_classifier, rte_features, RTEFeatureExtractor from nltk.classify.util import accuracy, apply_features, log_likelihood from nltk.classify.scikitlearn import SklearnClassifier from nltk.classify.maxent import (MaxentClassifier, BinaryMaxentFeatureEncoding, TypedMaxentFeatureEncoding, ConditionalExponentialClassifier) from nltk.classify.senna import Senna from nltk.classify.textcat import TextCat nltk-3.1/nltk/classify/api.py0000644000076500000240000001445612607224144016000 0ustar sbstaff00000000000000# Natural Language Toolkit: Classifier Interface # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # URL: # For license information, see LICENSE.TXT """ Interfaces for labeling tokens with category labels (or "class labels"). ``ClassifierI`` is a standard interface for "single-category classification", in which the set of categories is known, the number of categories is finite, and each text belongs to exactly one category. ``MultiClassifierI`` is a standard interface for "multi-category classification", which is like single-category classification except that each text belongs to zero or more categories. """ from nltk.internals import overridden ##////////////////////////////////////////////////////// #{ Classification Interfaces ##////////////////////////////////////////////////////// class ClassifierI(object): """ A processing interface for labeling tokens with a single category label (or "class"). Labels are typically strs or ints, but can be any immutable type. The set of labels that the classifier chooses from must be fixed and finite. Subclasses must define: - ``labels()`` - either ``classify()`` or ``classify_many()`` (or both) Subclasses may define: - either ``prob_classify()`` or ``prob_classify_many()`` (or both) """ def labels(self): """ :return: the list of category labels used by this classifier. :rtype: list of (immutable) """ raise NotImplementedError() def classify(self, featureset): """ :return: the most appropriate label for the given featureset. :rtype: label """ if overridden(self.classify_many): return self.classify_many([featureset])[0] else: raise NotImplementedError() def prob_classify(self, featureset): """ :return: a probability distribution over labels for the given featureset. :rtype: ProbDistI """ if overridden(self.prob_classify_many): return self.prob_classify_many([featureset])[0] else: raise NotImplementedError() def classify_many(self, featuresets): """ Apply ``self.classify()`` to each element of ``featuresets``. I.e.: return [self.classify(fs) for fs in featuresets] :rtype: list(label) """ return [self.classify(fs) for fs in featuresets] def prob_classify_many(self, featuresets): """ Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.: return [self.prob_classify(fs) for fs in featuresets] :rtype: list(ProbDistI) """ return [self.prob_classify(fs) for fs in featuresets] class MultiClassifierI(object): """ A processing interface for labeling tokens with zero or more category labels (or "labels"). Labels are typically strs or ints, but can be any immutable type. 
The set of labels that the multi-classifier chooses from must be fixed and finite. Subclasses must define: - ``labels()`` - either ``classify()`` or ``classify_many()`` (or both) Subclasses may define: - either ``prob_classify()`` or ``prob_classify_many()`` (or both) """ def labels(self): """ :return: the list of category labels used by this classifier. :rtype: list of (immutable) """ raise NotImplementedError() def classify(self, featureset): """ :return: the most appropriate set of labels for the given featureset. :rtype: set(label) """ if overridden(self.classify_many): return self.classify_many([featureset])[0] else: raise NotImplementedError() def prob_classify(self, featureset): """ :return: a probability distribution over sets of labels for the given featureset. :rtype: ProbDistI """ if overridden(self.prob_classify_many): return self.prob_classify_many([featureset])[0] else: raise NotImplementedError() def classify_many(self, featuresets): """ Apply ``self.classify()`` to each element of ``featuresets``. I.e.: return [self.classify(fs) for fs in featuresets] :rtype: list(set(label)) """ return [self.classify(fs) for fs in featuresets] def prob_classify_many(self, featuresets): """ Apply ``self.prob_classify()`` to each element of ``featuresets``. I.e.: return [self.prob_classify(fs) for fs in featuresets] :rtype: list(ProbDistI) """ return [self.prob_classify(fs) for fs in featuresets] # # [XX] IN PROGRESS: # class SequenceClassifierI(object): # """ # A processing interface for labeling sequences of tokens with a # single category label (or "class"). Labels are typically # strs or ints, but can be any immutable type. The set # of labels that the classifier chooses from must be fixed and # finite. # """ # def labels(self): # """ # :return: the list of category labels used by this classifier. # :rtype: list of (immutable) # """ # raise NotImplementedError() # def prob_classify(self, featureset): # """ # Return a probability distribution over labels for the given # featureset. # If ``featureset`` is a list of featuresets, then return a # corresponding list containing the probability distribution # over labels for each of the given featuresets, where the # *i*\ th element of this list is the most appropriate label for # the *i*\ th element of ``featuresets``. # """ # raise NotImplementedError() # def classify(self, featureset): # """ # Return the most appropriate label for the given featureset. # If ``featureset`` is a list of featuresets, then return a # corresponding list containing the most appropriate label for # each of the given featuresets, where the *i*\ th element of # this list is the most appropriate label for the *i*\ th element # of ``featuresets``. # """ # raise NotImplementedError() nltk-3.1/nltk/classify/decisiontree.py0000644000076500000240000002765612607224144017712 0ustar sbstaff00000000000000# Natural Language Toolkit: Decision Tree Classifiers # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ A classifier model that decides which label to assign to a token on the basis of a tree structure, where branches correspond to conditions on feature values, and leaves correspond to label assignments. 
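A minimal usage sketch (the toy training data is illustrative only)::

    >>> from nltk.classify import DecisionTreeClassifier                 # doctest: +SKIP
    >>> train_set = [({'size': 'big'}, 'yes'), ({'size': 'small'}, 'no')]
    >>> classifier = DecisionTreeClassifier.train(train_set)             # doctest: +SKIP
    >>> classifier.classify({'size': 'big'})                             # doctest: +SKIP
    'yes'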
""" from __future__ import print_function, unicode_literals from collections import defaultdict from nltk.probability import FreqDist, MLEProbDist, entropy from nltk.classify.api import ClassifierI from nltk.compat import python_2_unicode_compatible @python_2_unicode_compatible class DecisionTreeClassifier(ClassifierI): def __init__(self, label, feature_name=None, decisions=None, default=None): """ :param label: The most likely label for tokens that reach this node in the decision tree. If this decision tree has no children, then this label will be assigned to any token that reaches this decision tree. :param feature_name: The name of the feature that this decision tree selects for. :param decisions: A dictionary mapping from feature values for the feature identified by ``feature_name`` to child decision trees. :param default: The child that will be used if the value of feature ``feature_name`` does not match any of the keys in ``decisions``. This is used when constructing binary decision trees. """ self._label = label self._fname = feature_name self._decisions = decisions self._default = default def labels(self): labels = [self._label] if self._decisions is not None: for dt in self._decisions.values(): labels.extend(dt.labels()) if self._default is not None: labels.extend(self._default.labels()) return list(set(labels)) def classify(self, featureset): # Decision leaf: if self._fname is None: return self._label # Decision tree: fval = featureset.get(self._fname) if fval in self._decisions: return self._decisions[fval].classify(featureset) elif self._default is not None: return self._default.classify(featureset) else: return self._label def error(self, labeled_featuresets): errors = 0 for featureset, label in labeled_featuresets: if self.classify(featureset) != label: errors += 1 return float(errors)/len(labeled_featuresets) def pretty_format(self, width=70, prefix='', depth=4): """ Return a string containing a pretty-printed version of this decision tree. Each line in this string corresponds to a single decision tree node or leaf, and indentation is used to display the structure of the decision tree. """ # [xx] display default!! if self._fname is None: n = width-len(prefix)-15 return '%s%s %s\n' % (prefix, '.'*n, self._label) s = '' for i, (fval, result) in enumerate(sorted(self._decisions.items())): hdr = '%s%s=%s? ' % (prefix, self._fname, fval) n = width-15-len(hdr) s += '%s%s %s\n' % (hdr, '.'*(n), result._label) if result._fname is not None and depth>1: s += result.pretty_format(width, prefix+' ', depth-1) if self._default is not None: n = width-len(prefix)-21 s += '%selse: %s %s\n' % (prefix, '.'*n, self._default._label) if self._default._fname is not None and depth>1: s += self._default.pretty_format(width, prefix+' ', depth-1) return s def pseudocode(self, prefix='', depth=4): """ Return a string representation of this decision tree that expresses the decisions it makes as a nested set of pseudocode if statements. 
""" if self._fname is None: return "%sreturn %r\n" % (prefix, self._label) s = '' for (fval, result) in sorted(self._decisions.items()): s += '%sif %s == %r: ' % (prefix, self._fname, fval) if result._fname is not None and depth>1: s += '\n'+result.pseudocode(prefix+' ', depth-1) else: s += 'return %r\n' % result._label if self._default is not None: if len(self._decisions) == 1: s += '%sif %s != %r: '% (prefix, self._fname, list(self._decisions.keys())[0]) else: s += '%selse: ' % (prefix,) if self._default._fname is not None and depth>1: s += '\n'+self._default.pseudocode(prefix+' ', depth-1) else: s += 'return %r\n' % self._default._label return s def __str__(self): return self.pretty_format() @staticmethod def train(labeled_featuresets, entropy_cutoff=0.05, depth_cutoff=100, support_cutoff=10, binary=False, feature_values=None, verbose=False): """ :param binary: If true, then treat all feature/value pairs as individual binary features, rather than using a single n-way branch for each feature. """ # Collect a list of all feature names. feature_names = set() for featureset, label in labeled_featuresets: for fname in featureset: feature_names.add(fname) # Collect a list of the values each feature can take. if feature_values is None and binary: feature_values = defaultdict(set) for featureset, label in labeled_featuresets: for fname, fval in featureset.items(): feature_values[fname].add(fval) # Start with a stump. if not binary: tree = DecisionTreeClassifier.best_stump( feature_names, labeled_featuresets, verbose) else: tree = DecisionTreeClassifier.best_binary_stump( feature_names, labeled_featuresets, feature_values, verbose) # Refine the stump. tree.refine(labeled_featuresets, entropy_cutoff, depth_cutoff-1, support_cutoff, binary, feature_values, verbose) # Return it return tree @staticmethod def leaf(labeled_featuresets): label = FreqDist(label for (featureset, label) in labeled_featuresets).max() return DecisionTreeClassifier(label) @staticmethod def stump(feature_name, labeled_featuresets): label = FreqDist(label for (featureset, label) in labeled_featuresets).max() # Find the best label for each value. 
freqs = defaultdict(FreqDist) # freq(label|value) for featureset, label in labeled_featuresets: feature_value = featureset.get(feature_name) freqs[feature_value][label] += 1 decisions = dict((val, DecisionTreeClassifier(freqs[val].max())) for val in freqs) return DecisionTreeClassifier(label, feature_name, decisions) def refine(self, labeled_featuresets, entropy_cutoff, depth_cutoff, support_cutoff, binary=False, feature_values=None, verbose=False): if len(labeled_featuresets) <= support_cutoff: return if self._fname is None: return if depth_cutoff <= 0: return for fval in self._decisions: fval_featuresets = [(featureset, label) for (featureset, label) in labeled_featuresets if featureset.get(self._fname) == fval] label_freqs = FreqDist(label for (featureset, label) in fval_featuresets) if entropy(MLEProbDist(label_freqs)) > entropy_cutoff: self._decisions[fval] = DecisionTreeClassifier.train( fval_featuresets, entropy_cutoff, depth_cutoff, support_cutoff, binary, feature_values, verbose) if self._default is not None: default_featuresets = [(featureset, label) for (featureset, label) in labeled_featuresets if featureset.get(self._fname) not in self._decisions] label_freqs = FreqDist(label for (featureset, label) in default_featuresets) if entropy(MLEProbDist(label_freqs)) > entropy_cutoff: self._default = DecisionTreeClassifier.train( default_featuresets, entropy_cutoff, depth_cutoff, support_cutoff, binary, feature_values, verbose) @staticmethod def best_stump(feature_names, labeled_featuresets, verbose=False): best_stump = DecisionTreeClassifier.leaf(labeled_featuresets) best_error = best_stump.error(labeled_featuresets) for fname in feature_names: stump = DecisionTreeClassifier.stump(fname, labeled_featuresets) stump_error = stump.error(labeled_featuresets) if stump_error < best_error: best_error = stump_error best_stump = stump if verbose: print(('best stump for %6d toks uses %-20s err=%6.4f' % (len(labeled_featuresets), best_stump._fname, best_error))) return best_stump @staticmethod def binary_stump(feature_name, feature_value, labeled_featuresets): label = FreqDist(label for (featureset, label) in labeled_featuresets).max() # Find the best label for each value. pos_fdist = FreqDist() neg_fdist = FreqDist() for featureset, label in labeled_featuresets: if featureset.get(feature_name) == feature_value: pos_fdist[label] += 1 else: neg_fdist[label] += 1 decisions = {} default = label # But hopefully we have observations! 
if pos_fdist.N() > 0: decisions = {feature_value: DecisionTreeClassifier(pos_fdist.max())} if neg_fdist.N() > 0: default = DecisionTreeClassifier(neg_fdist.max()) return DecisionTreeClassifier(label, feature_name, decisions, default) @staticmethod def best_binary_stump(feature_names, labeled_featuresets, feature_values, verbose=False): best_stump = DecisionTreeClassifier.leaf(labeled_featuresets) best_error = best_stump.error(labeled_featuresets) for fname in feature_names: for fval in feature_values[fname]: stump = DecisionTreeClassifier.binary_stump( fname, fval, labeled_featuresets) stump_error = stump.error(labeled_featuresets) if stump_error < best_error: best_error = stump_error best_stump = stump if best_stump._decisions: descr = '%s=%s' % (best_stump._fname, list(best_stump._decisions.keys())[0]) else: descr = '(default)' if verbose: print(('best stump for %6d toks uses %-20s err=%6.4f' % (len(labeled_featuresets), descr, best_error))) return best_stump ##////////////////////////////////////////////////////// ## Demo ##////////////////////////////////////////////////////// def f(x): return DecisionTreeClassifier.train(x, binary=True, verbose=True) def demo(): from nltk.classify.util import names_demo, binary_names_demo_features classifier = names_demo(f, #DecisionTreeClassifier.train, binary_names_demo_features) print(classifier.pp(depth=7)) print(classifier.pseudocode(depth=7)) if __name__ == '__main__': demo() nltk-3.1/nltk/classify/maxent.py0000644000076500000240000016431112607224144016517 0ustar sbstaff00000000000000# Natural Language Toolkit: Maximum Entropy Classifiers # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Dmitry Chichkov (TypedMaxentFeatureEncoding) # URL: # For license information, see LICENSE.TXT """ A classifier model based on maximum entropy modeling framework. This framework considers all of the probability distributions that are empirically consistent with the training data; and chooses the distribution with the highest entropy. A probability distribution is "empirically consistent" with a set of training data if its estimated frequency with which a class and a feature vector value co-occur is equal to the actual frequency in the data. Terminology: 'feature' ====================== The term *feature* is usually used to refer to some property of an unlabeled token. For example, when performing word sense disambiguation, we might define a ``'prevword'`` feature whose value is the word preceding the target word. However, in the context of maxent modeling, the term *feature* is typically used to refer to a property of a "labeled" token. In order to prevent confusion, we will introduce two distinct terms to disambiguate these two different concepts: - An "input-feature" is a property of an unlabeled token. - A "joint-feature" is a property of a labeled token. In the rest of the ``nltk.classify`` module, the term "features" is used to refer to what we will call "input-features" in this module. In literature that describes and discusses maximum entropy models, input-features are typically called "contexts", and joint-features are simply referred to as "features". Converting Input-Features to Joint-Features ------------------------------------------- In maximum entropy models, joint-features are required to have numeric values. 
Typically, each input-feature ``input_feat`` is mapped to a set of joint-features of the form: | joint_feat(token, label) = { 1 if input_feat(token) == feat_val | { and label == some_label | { | { 0 otherwise For all values of ``feat_val`` and ``some_label``. This mapping is performed by classes that implement the ``MaxentFeatureEncodingI`` interface. """ from __future__ import print_function, unicode_literals __docformat__ = 'epytext en' try: import numpy except ImportError: pass import tempfile import os from collections import defaultdict from nltk import compat from nltk.data import gzip_open_unicode from nltk.util import OrderedDict from nltk.probability import DictionaryProbDist from nltk.classify.api import ClassifierI from nltk.classify.util import CutoffChecker, accuracy, log_likelihood from nltk.classify.megam import (call_megam, write_megam_file, parse_megam_weights) from nltk.classify.tadm import call_tadm, write_tadm_file, parse_tadm_weights ###################################################################### #{ Classifier Model ###################################################################### @compat.python_2_unicode_compatible class MaxentClassifier(ClassifierI): """ A maximum entropy classifier (also known as a "conditional exponential classifier"). This classifier is parameterized by a set of "weights", which are used to combine the joint-features that are generated from a featureset by an "encoding". In particular, the encoding maps each ``(featureset, label)`` pair to a vector. The probability of each label is then computed using the following equation:: dotprod(weights, encode(fs,label)) prob(fs|label) = --------------------------------------------------- sum(dotprod(weights, encode(fs,l)) for l in labels) Where ``dotprod`` is the dot product:: dotprod(a,b) = sum(x*y for (x,y) in zip(a,b)) """ def __init__(self, encoding, weights, logarithmic=True): """ Construct a new maxent classifier model. Typically, new classifier models are created using the ``train()`` method. :type encoding: MaxentFeatureEncodingI :param encoding: An encoding that is used to convert the featuresets that are given to the ``classify`` method into joint-feature vectors, which are used by the maxent classifier model. :type weights: list of float :param weights: The feature weight vector for this classifier. :type logarithmic: bool :param logarithmic: If false, then use non-logarithmic weights. """ self._encoding = encoding self._weights = weights self._logarithmic = logarithmic #self._logarithmic = False assert encoding.length() == len(weights) def labels(self): return self._encoding.labels() def set_weights(self, new_weights): """ Set the feature weight vector for this classifier. :param new_weights: The new feature weight vector. :type new_weights: list of float """ self._weights = new_weights assert self._encoding.length() == len(new_weights) def weights(self): """ :return: The feature weight vector for this classifier. 
:rtype: list of float """ return self._weights def classify(self, featureset): return self.prob_classify(featureset).max() def prob_classify(self, featureset): prob_dict = {} for label in self._encoding.labels(): feature_vector = self._encoding.encode(featureset, label) if self._logarithmic: total = 0.0 for (f_id, f_val) in feature_vector: total += self._weights[f_id] * f_val prob_dict[label] = total else: prod = 1.0 for (f_id, f_val) in feature_vector: prod *= self._weights[f_id] ** f_val prob_dict[label] = prod # Normalize the dictionary to give a probability distribution return DictionaryProbDist(prob_dict, log=self._logarithmic, normalize=True) def explain(self, featureset, columns=4): """ Print a table showing the effect of each of the features in the given feature set, and how they combine to determine the probabilities of each label for that featureset. """ descr_width = 50 TEMPLATE = ' %-'+str(descr_width-2)+'s%s%8.3f' pdist = self.prob_classify(featureset) labels = sorted(pdist.samples(), key=pdist.prob, reverse=True) labels = labels[:columns] print(' Feature'.ljust(descr_width)+''.join( '%8s' % (("%s" % l)[:7]) for l in labels)) print(' '+'-'*(descr_width-2+8*len(labels))) sums = defaultdict(int) for i, label in enumerate(labels): feature_vector = self._encoding.encode(featureset, label) feature_vector.sort(key=lambda fid__: abs(self._weights[fid__[0]]), reverse=True) for (f_id, f_val) in feature_vector: if self._logarithmic: score = self._weights[f_id] * f_val else: score = self._weights[f_id] ** f_val descr = self._encoding.describe(f_id) descr = descr.split(' and label is ')[0] # hack descr += ' (%s)' % f_val # hack if len(descr) > 47: descr = descr[:44]+'...' print(TEMPLATE % (descr, i*8*' ', score)) sums[label] += score print(' '+'-'*(descr_width-1+8*len(labels))) print(' TOTAL:'.ljust(descr_width)+''.join( '%8.3f' % sums[l] for l in labels)) print(' PROBS:'.ljust(descr_width)+''.join( '%8.3f' % pdist.prob(l) for l in labels)) def show_most_informative_features(self, n=10, show='all'): """ :param show: all, neg, or pos (for negative-only or positive-only) """ fids = sorted(list(range(len(self._weights))), key=lambda fid: abs(self._weights[fid]), reverse=True) if show == 'pos': fids = [fid for fid in fids if self._weights[fid] > 0] elif show == 'neg': fids = [fid for fid in fids if self._weights[fid] < 0] for fid in fids[:n]: print('%8.3f %s' % (self._weights[fid], self._encoding.describe(fid))) def __repr__(self): return ('' % (len(self._encoding.labels()), self._encoding.length())) #: A list of the algorithm names that are accepted for the #: ``train()`` method's ``algorithm`` parameter. ALGORITHMS = ['GIS', 'IIS', 'MEGAM', 'TADM'] @classmethod def train(cls, train_toks, algorithm=None, trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, **cutoffs): """ Train a new maxent classifier based on the given corpus of training samples. This classifier will have its weights chosen to maximize entropy while remaining empirically consistent with the training corpus. :rtype: MaxentClassifier :return: The new maxent classifier :type train_toks: list :param train_toks: Training data, represented as a list of pairs, the first member of which is a featureset, and the second of which is a classification label. :type algorithm: str :param algorithm: A case-insensitive string, specifying which algorithm should be used to train the classifier. The following algorithms are currently available. 
- Iterative Scaling Methods: Generalized Iterative Scaling (``'GIS'``), Improved Iterative Scaling (``'IIS'``) - External Libraries (requiring megam): LM-BFGS algorithm, with training performed by Megam (``'megam'``) The default algorithm is ``'IIS'``. :type trace: int :param trace: The level of diagnostic tracing output to produce. Higher values produce more verbose output. :type encoding: MaxentFeatureEncodingI :param encoding: A feature encoding, used to convert featuresets into feature vectors. If none is specified, then a ``BinaryMaxentFeatureEncoding`` will be built based on the features that are attested in the training corpus. :type labels: list(str) :param labels: The set of possible labels. If none is given, then the set of all labels attested in the training data will be used instead. :param gaussian_prior_sigma: The sigma value for a gaussian prior on model weights. Currently, this is supported by ``megam``. For other algorithms, its value is ignored. :param cutoffs: Arguments specifying various conditions under which the training should be halted. (Some of the cutoff conditions are not supported by some algorithms.) - ``max_iter=v``: Terminate after ``v`` iterations. - ``min_ll=v``: Terminate after the negative average log-likelihood drops under ``v``. - ``min_lldelta=v``: Terminate if a single iteration improves log likelihood by less than ``v``. """ if algorithm is None: algorithm = 'iis' for key in cutoffs: if key not in ('max_iter', 'min_ll', 'min_lldelta', 'max_acc', 'min_accdelta', 'count_cutoff', 'norm', 'explicit', 'bernoulli'): raise TypeError('Unexpected keyword arg %r' % key) algorithm = algorithm.lower() if algorithm == 'iis': return train_maxent_classifier_with_iis( train_toks, trace, encoding, labels, **cutoffs) elif algorithm == 'gis': return train_maxent_classifier_with_gis( train_toks, trace, encoding, labels, **cutoffs) elif algorithm == 'megam': return train_maxent_classifier_with_megam( train_toks, trace, encoding, labels, gaussian_prior_sigma, **cutoffs) elif algorithm == 'tadm': kwargs = cutoffs kwargs['trace'] = trace kwargs['encoding'] = encoding kwargs['labels'] = labels kwargs['gaussian_prior_sigma'] = gaussian_prior_sigma return TadmMaxentClassifier.train(train_toks, **kwargs) else: raise ValueError('Unknown algorithm %s' % algorithm) #: Alias for MaxentClassifier. ConditionalExponentialClassifier = MaxentClassifier ###################################################################### #{ Feature Encodings ###################################################################### class MaxentFeatureEncodingI(object): """ A mapping that converts a set of input-feature values to a vector of joint-feature values, given a label. This conversion is necessary to translate featuresets into a format that can be used by maximum entropy models. The set of joint-features used by a given encoding is fixed, and each index in the generated joint-feature vectors corresponds to a single joint-feature. The length of the generated joint-feature vectors is therefore constant (for a given encoding). Because the joint-feature vectors generated by ``MaxentFeatureEncodingI`` are typically very sparse, they are represented as a list of ``(index, value)`` tuples, specifying the value of each non-zero joint-feature. Feature encodings are generally created using the ``train()`` method, which generates an appropriate encoding based on the input-feature values and labels that are present in a given corpus. 
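    For example, the following small sketch (using the
    ``BinaryMaxentFeatureEncoding`` subclass defined below; the feature
    names, values, and labels are invented purely for illustration) shows
    how a trained encoding maps a ``(featureset, label)`` pair to a sparse
    joint-feature vector:

    >>> train_toks = [({'size': 'big'}, 'mammal'),
    ...               ({'size': 'small'}, 'insect')]
    >>> encoding = BinaryMaxentFeatureEncoding.train(train_toks)
    >>> encoding.length()
    2
    >>> encoding.encode({'size': 'big'}, 'mammal')
    [(0, 1)]
    >>> encoding.describe(0)
    "size=='big' and label is 'mammal'"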
""" def encode(self, featureset, label): """ Given a (featureset, label) pair, return the corresponding vector of joint-feature values. This vector is represented as a list of ``(index, value)`` tuples, specifying the value of each non-zero joint-feature. :type featureset: dict :rtype: list(tuple(int, int)) """ raise NotImplementedError() def length(self): """ :return: The size of the fixed-length joint-feature vectors that are generated by this encoding. :rtype: int """ raise NotImplementedError() def labels(self): """ :return: A list of the \"known labels\" -- i.e., all labels ``l`` such that ``self.encode(fs,l)`` can be a nonzero joint-feature vector for some value of ``fs``. :rtype: list """ raise NotImplementedError() def describe(self, fid): """ :return: A string describing the value of the joint-feature whose index in the generated feature vectors is ``fid``. :rtype: str """ raise NotImplementedError() def train(cls, train_toks): """ Construct and return new feature encoding, based on a given training corpus ``train_toks``. :type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label. """ raise NotImplementedError() class FunctionBackedMaxentFeatureEncoding(MaxentFeatureEncodingI): """ A feature encoding that calls a user-supplied function to map a given featureset/label pair to a sparse joint-feature vector. """ def __init__(self, func, length, labels): """ Construct a new feature encoding based on the given function. :type func: (callable) :param func: A function that takes two arguments, a featureset and a label, and returns the sparse joint feature vector that encodes them:: func(featureset, label) -> feature_vector This sparse joint feature vector (``feature_vector``) is a list of ``(index,value)`` tuples. :type length: int :param length: The size of the fixed-length joint-feature vectors that are generated by this encoding. :type labels: list :param labels: A list of the \"known labels\" for this encoding -- i.e., all labels ``l`` such that ``self.encode(fs,l)`` can be a nonzero joint-feature vector for some value of ``fs``. """ self._length = length self._func = func self._labels = labels def encode(self, featureset, label): return self._func(featureset, label) def length(self): return self._length def labels(self): return self._labels def describe(self, fid): return 'no description available' class BinaryMaxentFeatureEncoding(MaxentFeatureEncodingI): """ A feature encoding that generates vectors containing a binary joint-features of the form: | joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label) | { | { 0 otherwise Where ``fname`` is the name of an input-feature, ``fval`` is a value for that input-feature, and ``label`` is a label. Typically, these features are constructed based on a training corpus, using the ``train()`` method. This method will create one feature for each combination of ``fname``, ``fval``, and ``label`` that occurs at least once in the training corpus. The ``unseen_features`` parameter can be used to add "unseen-value features", which are used whenever an input feature has a value that was not encountered in the training corpus. These features have the form: | joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname]) | { and l == label | { | { 0 otherwise Where ``is_unseen(fname, fval)`` is true if the encoding does not contain any joint features that are true when ``fs[fname]==fval``. 
The ``alwayson_features`` parameter can be used to add "always-on features", which have the form:: | joint_feat(fs, l) = { 1 if (l == label) | { | { 0 otherwise These always-on features allow the maxent model to directly model the prior probabilities of each label. """ def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False): """ :param labels: A list of the \"known labels\" for this encoding. :param mapping: A dictionary mapping from ``(fname,fval,label)`` tuples to corresponding joint-feature indexes. These indexes must be the set of integers from 0...len(mapping). If ``mapping[fname,fval,label]=id``, then ``self.encode(..., fname:fval, ..., label)[id]`` is 1; otherwise, it is 0. :param unseen_features: If true, then include unseen value features in the generated joint-feature vectors. :param alwayson_features: If true, then include always-on features in the generated joint-feature vectors. """ if set(mapping.values()) != set(range(len(mapping))): raise ValueError('Mapping values must be exactly the ' 'set of integers from 0...len(mapping)') self._labels = list(labels) """A list of attested labels.""" self._mapping = mapping """dict mapping from (fname,fval,label) -> fid""" self._length = len(mapping) """The length of generated joint feature vectors.""" self._alwayson = None """dict mapping from label -> fid""" self._unseen = None """dict mapping from fname -> fid""" if alwayson_features: self._alwayson = dict((label, i+self._length) for (i, label) in enumerate(labels)) self._length += len(self._alwayson) if unseen_features: fnames = set(fname for (fname, fval, label) in mapping) self._unseen = dict((fname, i+self._length) for (i, fname) in enumerate(fnames)) self._length += len(fnames) def encode(self, featureset, label): # Inherit docs. encoding = [] # Convert input-features to joint-features: for fname, fval in featureset.items(): # Known feature name & value: if (fname, fval, label) in self._mapping: encoding.append((self._mapping[fname, fval, label], 1)) # Otherwise, we might want to fire an "unseen-value feature". elif self._unseen: # Have we seen this fname/fval combination with any label? for label2 in self._labels: if (fname, fval, label2) in self._mapping: break # we've seen this fname/fval combo # We haven't -- fire the unseen-value feature else: if fname in self._unseen: encoding.append((self._unseen[fname], 1)) # Add always-on features: if self._alwayson and label in self._alwayson: encoding.append((self._alwayson[label], 1)) return encoding def describe(self, f_id): # Inherit docs. if not isinstance(f_id, compat.integer_types): raise TypeError('describe() expected an int') try: self._inv_mapping except AttributeError: self._inv_mapping = [-1]*len(self._mapping) for (info, i) in self._mapping.items(): self._inv_mapping[i] = info if f_id < len(self._mapping): (fname, fval, label) = self._inv_mapping[f_id] return '%s==%r and label is %r' % (fname, fval, label) elif self._alwayson and f_id in self._alwayson.values(): for (label, f_id2) in self._alwayson.items(): if f_id == f_id2: return 'label is %r' % label elif self._unseen and f_id in self._unseen.values(): for (fname, f_id2) in self._unseen.items(): if f_id == f_id2: return '%s is unseen' % fname else: raise ValueError('Bad feature id') def labels(self): # Inherit docs. return self._labels def length(self): # Inherit docs. 
return self._length @classmethod def train(cls, train_toks, count_cutoff=0, labels=None, **options): """ Construct and return new feature encoding, based on a given training corpus ``train_toks``. See the class description ``BinaryMaxentFeatureEncoding`` for a description of the joint-features that will be included in this encoding. :type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label. :type count_cutoff: int :param count_cutoff: A cutoff value that is used to discard rare joint-features. If a joint-feature's value is 1 fewer than ``count_cutoff`` times in the training corpus, then that joint-feature is not included in the generated encoding. :type labels: list :param labels: A list of labels that should be used by the classifier. If not specified, then the set of labels attested in ``train_toks`` will be used. :param options: Extra parameters for the constructor, such as ``unseen_features`` and ``alwayson_features``. """ mapping = {} # maps (fname, fval, label) -> fid seen_labels = set() # The set of labels we've encountered count = defaultdict(int) # maps (fname, fval) -> count for (tok, label) in train_toks: if labels and label not in labels: raise ValueError('Unexpected label %s' % label) seen_labels.add(label) # Record each of the features. for (fname, fval) in tok.items(): # If a count cutoff is given, then only add a joint # feature once the corresponding (fname, fval, label) # tuple exceeds that cutoff. count[fname, fval] += 1 if count[fname, fval] >= count_cutoff: if (fname, fval, label) not in mapping: mapping[fname, fval, label] = len(mapping) if labels is None: labels = seen_labels return cls(labels, mapping, **options) class GISEncoding(BinaryMaxentFeatureEncoding): """ A binary feature encoding which adds one new joint-feature to the joint-features defined by ``BinaryMaxentFeatureEncoding``: a correction feature, whose value is chosen to ensure that the sparse vector always sums to a constant non-negative number. This new feature is used to ensure two preconditions for the GIS training algorithm: - At least one feature vector index must be nonzero for every token. - The feature vector must sum to a constant non-negative number for every token. """ def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False, C=None): """ :param C: The correction constant. The value of the correction feature is based on this value. In particular, its value is ``C - sum([v for (f,v) in encoding])``. :seealso: ``BinaryMaxentFeatureEncoding.__init__`` """ BinaryMaxentFeatureEncoding.__init__( self, labels, mapping, unseen_features, alwayson_features) if C is None: C = len(set(fname for (fname, fval, label) in mapping))+1 self._C = C @property def C(self): """The non-negative constant that all encoded feature vectors will sum to.""" return self._C def encode(self, featureset, label): # Get the basic encoding. encoding = BinaryMaxentFeatureEncoding.encode(self, featureset, label) base_length = BinaryMaxentFeatureEncoding.length(self) # Add a correction feature. 
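        # The correction value is C minus the sum of the base feature
        # values, so that every encoded vector sums to exactly C (one of
        # the GIS preconditions described in the class docstring).  For
        # example, if C is 5 and the base encoding sums to 2, the
        # correction feature is assigned the value 3.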
total = sum(v for (f, v) in encoding) if total >= self._C: raise ValueError('Correction feature is not high enough!') encoding.append((base_length, self._C-total)) # Return the result return encoding def length(self): return BinaryMaxentFeatureEncoding.length(self) + 1 def describe(self, f_id): if f_id == BinaryMaxentFeatureEncoding.length(self): return 'Correction feature (%s)' % self._C else: return BinaryMaxentFeatureEncoding.describe(self, f_id) class TadmEventMaxentFeatureEncoding(BinaryMaxentFeatureEncoding): def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False): self._mapping = OrderedDict(mapping) self._label_mapping = OrderedDict() BinaryMaxentFeatureEncoding.__init__(self, labels, self._mapping, unseen_features, alwayson_features) def encode(self, featureset, label): encoding = [] for feature, value in featureset.items(): if (feature, label) not in self._mapping: self._mapping[(feature, label)] = len(self._mapping) if value not in self._label_mapping: if not isinstance(value, int): self._label_mapping[value] = len(self._label_mapping) else: self._label_mapping[value] = value encoding.append((self._mapping[(feature, label)], self._label_mapping[value])) return encoding def labels(self): return self._labels def describe(self, fid): for (feature, label) in self._mapping: if self._mapping[(feature, label)] == fid: return (feature, label) def length(self): return len(self._mapping) @classmethod def train(cls, train_toks, count_cutoff=0, labels=None, **options): mapping = OrderedDict() if not labels: labels = [] # This gets read twice, so compute the values in case it's lazy. train_toks = list(train_toks) for (featureset, label) in train_toks: if label not in labels: labels.append(label) for (featureset, label) in train_toks: for label in labels: for feature in featureset: if (feature, label) not in mapping: mapping[(feature, label)] = len(mapping) return cls(labels, mapping, **options) class TypedMaxentFeatureEncoding(MaxentFeatureEncodingI): """ A feature encoding that generates vectors containing integer, float and binary joint-features of the form: Binary (for string and boolean features): | joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label) | { | { 0 otherwise Value (for integer and float features): | joint_feat(fs, l) = { fval if (fs[fname] == type(fval)) | { and (l == label) | { | { not encoded otherwise Where ``fname`` is the name of an input-feature, ``fval`` is a value for that input-feature, and ``label`` is a label. Typically, these features are constructed based on a training corpus, using the ``train()`` method. For string and boolean features [type(fval) not in (int, float)] this method will create one feature for each combination of ``fname``, ``fval``, and ``label`` that occurs at least once in the training corpus. For integer and float features [type(fval) in (int, float)] this method will create one feature for each combination of ``fname`` and ``label`` that occurs at least once in the training corpus. For binary features the ``unseen_features`` parameter can be used to add "unseen-value features", which are used whenever an input feature has a value that was not encountered in the training corpus. These features have the form: | joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname]) | { and l == label | { | { 0 otherwise Where ``is_unseen(fname, fval)`` is true if the encoding does not contain any joint features that are true when ``fs[fname]==fval``. 
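    For instance, in the following minimal sketch (the feature name
    ``count`` and the label are invented for illustration), an integer
    input-feature is encoded with its value, while a string value that was
    never attested in training fires no joint-feature at all by default:

    >>> train_toks = [({'count': 3}, 'NOUN')]
    >>> encoding = TypedMaxentFeatureEncoding.train(train_toks)
    >>> encoding.encode({'count': 5}, 'NOUN')
    [(0, 5)]
    >>> encoding.encode({'count': 'five'}, 'NOUN')
    []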
The ``alwayson_features`` parameter can be used to add "always-on features", which have the form: | joint_feat(fs, l) = { 1 if (l == label) | { | { 0 otherwise These always-on features allow the maxent model to directly model the prior probabilities of each label. """ def __init__(self, labels, mapping, unseen_features=False, alwayson_features=False): """ :param labels: A list of the \"known labels\" for this encoding. :param mapping: A dictionary mapping from ``(fname,fval,label)`` tuples to corresponding joint-feature indexes. These indexes must be the set of integers from 0...len(mapping). If ``mapping[fname,fval,label]=id``, then ``self.encode({..., fname:fval, ...``, label)[id]} is 1; otherwise, it is 0. :param unseen_features: If true, then include unseen value features in the generated joint-feature vectors. :param alwayson_features: If true, then include always-on features in the generated joint-feature vectors. """ if set(mapping.values()) != set(range(len(mapping))): raise ValueError('Mapping values must be exactly the ' 'set of integers from 0...len(mapping)') self._labels = list(labels) """A list of attested labels.""" self._mapping = mapping """dict mapping from (fname,fval,label) -> fid""" self._length = len(mapping) """The length of generated joint feature vectors.""" self._alwayson = None """dict mapping from label -> fid""" self._unseen = None """dict mapping from fname -> fid""" if alwayson_features: self._alwayson = dict((label, i+self._length) for (i, label) in enumerate(labels)) self._length += len(self._alwayson) if unseen_features: fnames = set(fname for (fname, fval, label) in mapping) self._unseen = dict((fname, i+self._length) for (i, fname) in enumerate(fnames)) self._length += len(fnames) def encode(self, featureset, label): # Inherit docs. encoding = [] # Convert input-features to joint-features: for fname, fval in featureset.items(): if isinstance(fval, (compat.integer_types, float)): # Known feature name & value: if (fname, type(fval), label) in self._mapping: encoding.append((self._mapping[fname, type(fval), label], fval)) else: # Known feature name & value: if (fname, fval, label) in self._mapping: encoding.append((self._mapping[fname, fval, label], 1)) # Otherwise, we might want to fire an "unseen-value feature". elif self._unseen: # Have we seen this fname/fval combination with any label? for label2 in self._labels: if (fname, fval, label2) in self._mapping: break # we've seen this fname/fval combo # We haven't -- fire the unseen-value feature else: if fname in self._unseen: encoding.append((self._unseen[fname], 1)) # Add always-on features: if self._alwayson and label in self._alwayson: encoding.append((self._alwayson[label], 1)) return encoding def describe(self, f_id): # Inherit docs. if not isinstance(f_id, compat.integer_types): raise TypeError('describe() expected an int') try: self._inv_mapping except AttributeError: self._inv_mapping = [-1]*len(self._mapping) for (info, i) in self._mapping.items(): self._inv_mapping[i] = info if f_id < len(self._mapping): (fname, fval, label) = self._inv_mapping[f_id] return '%s==%r and label is %r' % (fname, fval, label) elif self._alwayson and f_id in self._alwayson.values(): for (label, f_id2) in self._alwayson.items(): if f_id == f_id2: return 'label is %r' % label elif self._unseen and f_id in self._unseen.values(): for (fname, f_id2) in self._unseen.items(): if f_id == f_id2: return '%s is unseen' % fname else: raise ValueError('Bad feature id') def labels(self): # Inherit docs. 
return self._labels def length(self): # Inherit docs. return self._length @classmethod def train(cls, train_toks, count_cutoff=0, labels=None, **options): """ Construct and return new feature encoding, based on a given training corpus ``train_toks``. See the class description ``TypedMaxentFeatureEncoding`` for a description of the joint-features that will be included in this encoding. Note: recognized feature values types are (int, float), over types are interpreted as regular binary features. :type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label. :type count_cutoff: int :param count_cutoff: A cutoff value that is used to discard rare joint-features. If a joint-feature's value is 1 fewer than ``count_cutoff`` times in the training corpus, then that joint-feature is not included in the generated encoding. :type labels: list :param labels: A list of labels that should be used by the classifier. If not specified, then the set of labels attested in ``train_toks`` will be used. :param options: Extra parameters for the constructor, such as ``unseen_features`` and ``alwayson_features``. """ mapping = {} # maps (fname, fval, label) -> fid seen_labels = set() # The set of labels we've encountered count = defaultdict(int) # maps (fname, fval) -> count for (tok, label) in train_toks: if labels and label not in labels: raise ValueError('Unexpected label %s' % label) seen_labels.add(label) # Record each of the features. for (fname, fval) in tok.items(): if type(fval) in (int, float): fval = type(fval) # If a count cutoff is given, then only add a joint # feature once the corresponding (fname, fval, label) # tuple exceeds that cutoff. count[fname, fval] += 1 if count[fname, fval] >= count_cutoff: if (fname, fval, label) not in mapping: mapping[fname, fval, label] = len(mapping) if labels is None: labels = seen_labels return cls(labels, mapping, **options) ###################################################################### #{ Classifier Trainer: Generalized Iterative Scaling ###################################################################### def train_maxent_classifier_with_gis(train_toks, trace=3, encoding=None, labels=None, **cutoffs): """ Train a new ``ConditionalExponentialClassifier``, using the given training samples, using the Generalized Iterative Scaling algorithm. This ``ConditionalExponentialClassifier`` will encode the model that maximizes entropy from all the models that are empirically consistent with ``train_toks``. :see: ``train_maxent_classifier()`` for parameter descriptions. """ cutoffs.setdefault('max_iter', 100) cutoffchecker = CutoffChecker(cutoffs) # Construct an encoding from the training data. if encoding is None: encoding = GISEncoding.train(train_toks, labels=labels) if not hasattr(encoding, 'C'): raise TypeError('The GIS algorithm requires an encoding that ' 'defines C (e.g., GISEncoding).') # Cinv is the inverse of the sum of each joint feature vector. # This controls the learning rate: higher Cinv (or lower C) gives # faster learning. Cinv = 1.0/encoding.C # Count how many times each feature occurs in the training data. empirical_fcount = calculate_empirical_fcount(train_toks, encoding) # Check for any features that are not attested in train_toks. unattested = set(numpy.nonzero(empirical_fcount == 0)[0]) # Build the classifier. 
Start with weight=0 for each attested # feature, and weight=-infinity for each unattested feature. weights = numpy.zeros(len(empirical_fcount), 'd') for fid in unattested: weights[fid] = numpy.NINF classifier = ConditionalExponentialClassifier(encoding, weights) # Take the log of the empirical fcount. log_empirical_fcount = numpy.log2(empirical_fcount) del empirical_fcount # Old log-likelihood and accuracy; used to check if the change # in log-likelihood or accuracy is sufficient to indicate convergence. ll_old = None acc_old = None if trace > 0: print(' ==> Training (%d iterations)' % cutoffs['max_iter']) if trace > 2: print() print(' Iteration Log Likelihood Accuracy') print(' ---------------------------------------') # Train the classifier. try: while True: if trace > 2: ll = cutoffchecker.ll or log_likelihood(classifier, train_toks) acc = cutoffchecker.acc or accuracy(classifier, train_toks) iternum = cutoffchecker.iter print(' %9d %14.5f %9.3f' % (iternum, ll, acc)) # Use the model to estimate the number of times each # feature should occur in the training data. estimated_fcount = calculate_estimated_fcount( classifier, train_toks, encoding) # Take the log of estimated fcount (avoid taking log(0).) for fid in unattested: estimated_fcount[fid] += 1 log_estimated_fcount = numpy.log2(estimated_fcount) del estimated_fcount # Update the classifier weights weights = classifier.weights() weights += (log_empirical_fcount - log_estimated_fcount) * Cinv classifier.set_weights(weights) # Check the log-likelihood & accuracy cutoffs. if cutoffchecker.check(classifier, train_toks): break except KeyboardInterrupt: print(' Training stopped: keyboard interrupt') except: raise if trace > 2: ll = log_likelihood(classifier, train_toks) acc = accuracy(classifier, train_toks) print(' Final %14.5f %9.3f' % (ll, acc)) # Return the classifier. return classifier def calculate_empirical_fcount(train_toks, encoding): fcount = numpy.zeros(encoding.length(), 'd') for tok, label in train_toks: for (index, val) in encoding.encode(tok, label): fcount[index] += val return fcount def calculate_estimated_fcount(classifier, train_toks, encoding): fcount = numpy.zeros(encoding.length(), 'd') for tok, label in train_toks: pdist = classifier.prob_classify(tok) for label in pdist.samples(): prob = pdist.prob(label) for (fid, fval) in encoding.encode(tok, label): fcount[fid] += prob*fval return fcount ###################################################################### #{ Classifier Trainer: Improved Iterative Scaling ###################################################################### def train_maxent_classifier_with_iis(train_toks, trace=3, encoding=None, labels=None, **cutoffs): """ Train a new ``ConditionalExponentialClassifier``, using the given training samples, using the Improved Iterative Scaling algorithm. This ``ConditionalExponentialClassifier`` will encode the model that maximizes entropy from all the models that are empirically consistent with ``train_toks``. :see: ``train_maxent_classifier()`` for parameter descriptions. """ cutoffs.setdefault('max_iter', 100) cutoffchecker = CutoffChecker(cutoffs) # Construct an encoding from the training data. if encoding is None: encoding = BinaryMaxentFeatureEncoding.train(train_toks, labels=labels) # Count how many times each feature occurs in the training data. empirical_ffreq = (calculate_empirical_fcount(train_toks, encoding) / len(train_toks)) # Find the nf map, and related variables nfarray and nfident. # nf is the sum of the features for a given labeled text. 
# nfmap compresses this sparse set of values to a dense list. # nfarray performs the reverse operation. nfident is # nfarray multiplied by an identity matrix. nfmap = calculate_nfmap(train_toks, encoding) nfarray = numpy.array(sorted(nfmap, key=nfmap.__getitem__), 'd') nftranspose = numpy.reshape(nfarray, (len(nfarray), 1)) # Check for any features that are not attested in train_toks. unattested = set(numpy.nonzero(empirical_ffreq == 0)[0]) # Build the classifier. Start with weight=0 for each attested # feature, and weight=-infinity for each unattested feature. weights = numpy.zeros(len(empirical_ffreq), 'd') for fid in unattested: weights[fid] = numpy.NINF classifier = ConditionalExponentialClassifier(encoding, weights) if trace > 0: print(' ==> Training (%d iterations)' % cutoffs['max_iter']) if trace > 2: print() print(' Iteration Log Likelihood Accuracy') print(' ---------------------------------------') # Old log-likelihood and accuracy; used to check if the change # in log-likelihood or accuracy is sufficient to indicate convergence. ll_old = None acc_old = None # Train the classifier. try: while True: if trace > 2: ll = cutoffchecker.ll or log_likelihood(classifier, train_toks) acc = cutoffchecker.acc or accuracy(classifier, train_toks) iternum = cutoffchecker.iter print(' %9d %14.5f %9.3f' % (iternum, ll, acc)) # Calculate the deltas for this iteration, using Newton's method. deltas = calculate_deltas( train_toks, classifier, unattested, empirical_ffreq, nfmap, nfarray, nftranspose, encoding) # Use the deltas to update our weights. weights = classifier.weights() weights += deltas classifier.set_weights(weights) # Check the log-likelihood & accuracy cutoffs. if cutoffchecker.check(classifier, train_toks): break except KeyboardInterrupt: print(' Training stopped: keyboard interrupt') except: raise if trace > 2: ll = log_likelihood(classifier, train_toks) acc = accuracy(classifier, train_toks) print(' Final %14.5f %9.3f' % (ll, acc)) # Return the classifier. return classifier def calculate_nfmap(train_toks, encoding): """ Construct a map that can be used to compress ``nf`` (which is typically sparse). *nf(feature_vector)* is the sum of the feature values for *feature_vector*. This represents the number of features that are active for a given labeled text. This method finds all values of *nf(t)* that are attested for at least one token in the given list of training tokens; and constructs a dictionary mapping these attested values to a continuous range *0...N*. For example, if the only values of *nf()* that were attested were 3, 5, and 7, then ``_nfmap`` might return the dictionary ``{3:0, 5:1, 7:2}``. :return: A map that can be used to compress ``nf`` to a dense vector. :rtype: dict(int -> int) """ # Map from nf to indices. This allows us to use smaller arrays. nfset = set() for tok, _ in train_toks: for label in encoding.labels(): nfset.add(sum(val for (id, val) in encoding.encode(tok, label))) return dict((nf, i) for (i, nf) in enumerate(nfset)) def calculate_deltas(train_toks, classifier, unattested, ffreq_empirical, nfmap, nfarray, nftranspose, encoding): """ Calculate the update values for the classifier weights for this iteration of IIS. 
These update weights are the value of ``delta`` that solves the equation:: ffreq_empirical[i] = SUM[fs,l] (classifier.prob_classify(fs).prob(l) * feature_vector(fs,l)[i] * exp(delta[i] * nf(feature_vector(fs,l)))) Where: - *(fs,l)* is a (featureset, label) tuple from ``train_toks`` - *feature_vector(fs,l)* = ``encoding.encode(fs,l)`` - *nf(vector)* = ``sum([val for (id,val) in vector])`` This method uses Newton's method to solve this equation for *delta[i]*. In particular, it starts with a guess of ``delta[i]`` = 1; and iteratively updates ``delta`` with: | delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i]) until convergence, where *sum1* and *sum2* are defined as: | sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta) | sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta).nf(feature_vector(fs,l))) | f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) . | feature_vector(fs,l)[i] . | exp(delta[i] . nf(feature_vector(fs,l)))) Note that *sum1* and *sum2* depend on ``delta``; so they need to be re-computed each iteration. The variables ``nfmap``, ``nfarray``, and ``nftranspose`` are used to generate a dense encoding for *nf(ltext)*. This allows ``_deltas`` to calculate *sum1* and *sum2* using matrices, which yields a significant performance improvement. :param train_toks: The set of training tokens. :type train_toks: list(tuple(dict, str)) :param classifier: The current classifier. :type classifier: ClassifierI :param ffreq_empirical: An array containing the empirical frequency for each feature. The *i*\ th element of this array is the empirical frequency for feature *i*. :type ffreq_empirical: sequence of float :param unattested: An array that is 1 for features that are not attested in the training data; and 0 for features that are attested. In other words, ``unattested[i]==0`` iff ``ffreq_empirical[i]==0``. :type unattested: sequence of int :param nfmap: A map that can be used to compress ``nf`` to a dense vector. :type nfmap: dict(int -> int) :param nfarray: An array that can be used to uncompress ``nf`` from a dense vector. :type nfarray: array(float) :param nftranspose: The transpose of ``nfarray`` :type nftranspose: array(float) """ # These parameters control when we decide that we've # converged. It probably should be possible to set these # manually, via keyword arguments to train. NEWTON_CONVERGE = 1e-12 MAX_NEWTON = 300 deltas = numpy.ones(encoding.length(), 'd') # Precompute the A matrix: # A[nf][id] = sum ( p(fs) * p(label|fs) * f(fs,label) ) # over all label,fs s.t. num_features[label,fs]=nf A = numpy.zeros((len(nfmap), encoding.length()), 'd') for tok, label in train_toks: dist = classifier.prob_classify(tok) for label in encoding.labels(): # Generate the feature vector feature_vector = encoding.encode(tok, label) # Find the number of active features nf = sum(val for (id, val) in feature_vector) # Update the A matrix for (id, val) in feature_vector: A[nfmap[nf], id] += dist.prob(label) * val A /= len(train_toks) # Iteratively solve for delta. 
Use the following variables: # - nf_delta[x][y] = nfarray[x] * delta[y] # - exp_nf_delta[x][y] = exp(nf[x] * delta[y]) # - nf_exp_nf_delta[x][y] = nf[x] * exp(nf[x] * delta[y]) # - sum1[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) # exp(delta[i]nf) # - sum2[i][nf] = sum p(fs)p(label|fs)f[i](label,fs) # nf exp(delta[i]nf) for rangenum in range(MAX_NEWTON): nf_delta = numpy.outer(nfarray, deltas) exp_nf_delta = 2 ** nf_delta nf_exp_nf_delta = nftranspose * exp_nf_delta sum1 = numpy.sum(exp_nf_delta * A, axis=0) sum2 = numpy.sum(nf_exp_nf_delta * A, axis=0) # Avoid division by zero. for fid in unattested: sum2[fid] += 1 # Update the deltas. deltas -= (ffreq_empirical - sum1) / -sum2 # We can stop once we converge. n_error = (numpy.sum(abs((ffreq_empirical-sum1)))/ numpy.sum(abs(deltas))) if n_error < NEWTON_CONVERGE: return deltas return deltas ###################################################################### #{ Classifier Trainer: megam ###################################################################### # [xx] possible extension: add support for using implicit file format; # this would need to put requirements on what encoding is used. But # we may need this for other maxent classifier trainers that require # implicit formats anyway. def train_maxent_classifier_with_megam(train_toks, trace=3, encoding=None, labels=None, gaussian_prior_sigma=0, **kwargs): """ Train a new ``ConditionalExponentialClassifier``, using the given training samples, using the external ``megam`` library. This ``ConditionalExponentialClassifier`` will encode the model that maximizes entropy from all the models that are empirically consistent with ``train_toks``. :see: ``train_maxent_classifier()`` for parameter descriptions. :see: ``nltk.classify.megam`` """ explicit = True bernoulli = True if 'explicit' in kwargs: explicit = kwargs['explicit'] if 'bernoulli' in kwargs: bernoulli = kwargs['bernoulli'] # Construct an encoding from the training data. if encoding is None: # Count cutoff can also be controlled by megam with the -minfc # option. Not sure where the best place for it is. count_cutoff = kwargs.get('count_cutoff', 0) encoding = BinaryMaxentFeatureEncoding.train(train_toks, count_cutoff, labels=labels, alwayson_features=True) elif labels is not None: raise ValueError('Specify encoding or labels, not both') # Write a training file for megam. try: fd, trainfile_name = tempfile.mkstemp(prefix='nltk-') with open(trainfile_name, 'w') as trainfile: write_megam_file(train_toks, encoding, trainfile, explicit=explicit, bernoulli=bernoulli) os.close(fd) except (OSError, IOError, ValueError) as e: raise ValueError('Error while creating megam training file: %s' % e) # Run megam on the training file. options = [] options += ['-nobias', '-repeat', '10'] if explicit: options += ['-explicit'] if not bernoulli: options += ['-fvals'] if gaussian_prior_sigma: # Lambda is just the precision of the Gaussian prior, i.e. it's the # inverse variance, so the parameter conversion is 1.0/sigma**2. # See http://www.umiacs.umd.edu/~hal/docs/daume04cg-bfgs.pdf. 
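        # For example, gaussian_prior_sigma=2.0 yields an inverse variance
        # (the value passed to megam's -lambda option) of 1.0/2.0**2 == 0.25.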
inv_variance = 1.0 / gaussian_prior_sigma**2 else: inv_variance = 0 options += ['-lambda', '%.2f' % inv_variance, '-tune'] if trace < 3: options += ['-quiet'] if 'max_iter' in kwargs: options += ['-maxi', '%s' % kwargs['max_iter']] if 'll_delta' in kwargs: # [xx] this is actually a perplexity delta, not a log # likelihood delta options += ['-dpp', '%s' % abs(kwargs['ll_delta'])] if hasattr(encoding, 'cost'): options += ['-multilabel'] # each possible la options += ['multiclass', trainfile_name] stdout = call_megam(options) # print './megam_i686.opt ', ' '.join(options) # Delete the training file try: os.remove(trainfile_name) except (OSError, IOError) as e: print('Warning: unable to delete %s: %s' % (trainfile_name, e)) # Parse the generated weight vector. weights = parse_megam_weights(stdout, encoding.length(), explicit) # Convert from base-e to base-2 weights. weights *= numpy.log2(numpy.e) # Build the classifier return MaxentClassifier(encoding, weights) ###################################################################### #{ Classifier Trainer: tadm ###################################################################### class TadmMaxentClassifier(MaxentClassifier): @classmethod def train(cls, train_toks, **kwargs): algorithm = kwargs.get('algorithm', 'tao_lmvm') trace = kwargs.get('trace', 3) encoding = kwargs.get('encoding', None) labels = kwargs.get('labels', None) sigma = kwargs.get('gaussian_prior_sigma', 0) count_cutoff = kwargs.get('count_cutoff', 0) max_iter = kwargs.get('max_iter') ll_delta = kwargs.get('min_lldelta') # Construct an encoding from the training data. if not encoding: encoding = TadmEventMaxentFeatureEncoding.train(train_toks, count_cutoff, labels=labels) trainfile_fd, trainfile_name = \ tempfile.mkstemp(prefix='nltk-tadm-events-', suffix='.gz') weightfile_fd, weightfile_name = \ tempfile.mkstemp(prefix='nltk-tadm-weights-') trainfile = gzip_open_unicode(trainfile_name, 'w') write_tadm_file(train_toks, encoding, trainfile) trainfile.close() options = [] options.extend(['-monitor']) options.extend(['-method', algorithm]) if sigma: options.extend(['-l2', '%.6f' % sigma**2]) if max_iter: options.extend(['-max_it', '%d' % max_iter]) if ll_delta: options.extend(['-fatol', '%.6f' % abs(ll_delta)]) options.extend(['-events_in', trainfile_name]) options.extend(['-params_out', weightfile_name]) if trace < 3: options.extend(['2>&1']) else: options.extend(['-summary']) call_tadm(options) with open(weightfile_name, 'r') as weightfile: weights = parse_tadm_weights(weightfile) os.remove(trainfile_name) os.remove(weightfile_name) # Convert from base-e to base-2 weights. weights *= numpy.log2(numpy.e) # Build the classifier return cls(encoding, weights) ###################################################################### #{ Demo ###################################################################### def demo(): from nltk.classify.util import names_demo classifier = names_demo(MaxentClassifier.train) if __name__ == '__main__': demo() nltk-3.1/nltk/classify/megam.py0000644000076500000240000001436012607224144016307 0ustar sbstaff00000000000000# Natural Language Toolkit: Interface to Megam Classifier # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ A set of functions used to interface with the external megam_ maxent optimization package. Before megam can be used, you should tell NLTK where it can find the megam binary, using the ``config_megam()`` function. 
Typical usage: >>> from nltk.classify import megam >>> megam.config_megam() # pass path to megam if not found in PATH # doctest: +SKIP [Found megam: ...] Use with MaxentClassifier. Example below, see MaxentClassifier documentation for details. nltk.classify.MaxentClassifier.train(corpus, 'megam') .. _megam: http://www.umiacs.umd.edu/~hal/megam/index.html """ from __future__ import print_function import subprocess from nltk import compat from nltk.internals import find_binary try: import numpy except ImportError: numpy = None ###################################################################### #{ Configuration ###################################################################### _megam_bin = None def config_megam(bin=None): """ Configure NLTK's interface to the ``megam`` maxent optimization package. :param bin: The full path to the ``megam`` binary. If not specified, then nltk will search the system for a ``megam`` binary; and if one is not found, it will raise a ``LookupError`` exception. :type bin: str """ global _megam_bin _megam_bin = find_binary( 'megam', bin, env_vars=['MEGAM'], binary_names=['megam.opt', 'megam', 'megam_686', 'megam_i686.opt'], url='http://www.umiacs.umd.edu/~hal/megam/index.html') ###################################################################### #{ Megam Interface Functions ###################################################################### def write_megam_file(train_toks, encoding, stream, bernoulli=True, explicit=True): """ Generate an input file for ``megam`` based on the given corpus of classified tokens. :type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label. :type encoding: MaxentFeatureEncodingI :param encoding: A feature encoding, used to convert featuresets into feature vectors. May optionally implement a cost() method in order to assign different costs to different class predictions. :type stream: stream :param stream: The stream to which the megam input file should be written. :param bernoulli: If true, then use the 'bernoulli' format. I.e., all joint features have binary values, and are listed iff they are true. Otherwise, list feature values explicitly. If ``bernoulli=False``, then you must call ``megam`` with the ``-fvals`` option. :param explicit: If true, then use the 'explicit' format. I.e., list the features that would fire for any of the possible labels, for each token. If ``explicit=True``, then you must call ``megam`` with the ``-explicit`` option. """ # Look up the set of labels. labels = encoding.labels() labelnum = dict((label, i) for (i, label) in enumerate(labels)) # Write the file, which contains one line per instance. for featureset, label in train_toks: # First, the instance number (or, in the weighted multiclass case, the cost of each label). if hasattr(encoding, 'cost'): stream.write(':'.join(str(encoding.cost(featureset, label, l)) for l in labels)) else: stream.write('%d' % labelnum[label]) # For implicit file formats, just list the features that fire # for this instance's actual label. if not explicit: _write_megam_features(encoding.encode(featureset, label), stream, bernoulli) # For explicit formats, list the features that would fire for # any of the possible labels. else: for l in labels: stream.write(' #') _write_megam_features(encoding.encode(featureset, l), stream, bernoulli) # End of the instance. 
stream.write('\n') def parse_megam_weights(s, features_count, explicit=True): """ Given the stdout output generated by ``megam`` when training a model, return a ``numpy`` array containing the corresponding weight vector. This function does not currently handle bias features. """ if numpy is None: raise ValueError('This function requires that numpy be installed') assert explicit, 'non-explicit not supported yet' lines = s.strip().split('\n') weights = numpy.zeros(features_count, 'd') for line in lines: if line.strip(): fid, weight = line.split() weights[int(fid)] = float(weight) return weights def _write_megam_features(vector, stream, bernoulli): if not vector: raise ValueError('MEGAM classifier requires the use of an ' 'always-on feature.') for (fid, fval) in vector: if bernoulli: if fval == 1: stream.write(' %s' % fid) elif fval != 0: raise ValueError('If bernoulli=True, then all' 'features must be binary.') else: stream.write(' %s %s' % (fid, fval)) def call_megam(args): """ Call the ``megam`` binary with the given arguments. """ if isinstance(args, compat.string_types): raise TypeError('args should be a list of strings') if _megam_bin is None: config_megam() # Call megam via a subprocess cmd = [_megam_bin] + args p = subprocess.Popen(cmd, stdout=subprocess.PIPE) (stdout, stderr) = p.communicate() # Check the return code. if p.returncode != 0: print() print(stderr) raise OSError('megam command failed!') if isinstance(stdout, compat.string_types): return stdout else: return stdout.decode('utf-8') nltk-3.1/nltk/classify/naivebayes.py0000644000076500000240000002341312607224144017346 0ustar sbstaff00000000000000# Natural Language Toolkit: Naive Bayes Classifiers # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ A classifier based on the Naive Bayes algorithm. In order to find the probability for a label, this algorithm first uses the Bayes rule to express P(label|features) in terms of P(label) and P(features|label): | P(label) * P(features|label) | P(label|features) = ------------------------------ | P(features) The algorithm then makes the 'naive' assumption that all features are independent, given the label: | P(label) * P(f1|label) * ... * P(fn|label) | P(label|features) = -------------------------------------------- | P(features) Rather than computing P(featues) explicitly, the algorithm just calculates the denominator for each label, and normalizes them so they sum to one: | P(label) * P(f1|label) * ... * P(fn|label) | P(label|features) = -------------------------------------------- | SUM[l]( P(l) * P(f1|l) * ... * P(fn|l) ) """ from __future__ import print_function, unicode_literals from collections import defaultdict from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist, sum_logs from nltk.classify.api import ClassifierI ##////////////////////////////////////////////////////// ## Naive Bayes Classifier ##////////////////////////////////////////////////////// class NaiveBayesClassifier(ClassifierI): """ A Naive Bayes classifier. Naive Bayes classifiers are paramaterized by two probability distributions: - P(label) gives the probability that an input will receive each label, given no information about the input's features. - P(fname=fval|label) gives the probability that a given feature (fname) will receive a given value (fval), given that the label (label). 
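    A classifier is typically built from labeled featuresets with the
    ``train()`` class method.  A minimal sketch (the feature names,
    values, and labels below are invented purely for illustration):

    >>> train = [({'last_letter': 'a'}, 'female'),
    ...          ({'last_letter': 'k'}, 'male'),
    ...          ({'last_letter': 'a'}, 'female')]
    >>> classifier = NaiveBayesClassifier.train(train)
    >>> classifier.classify({'last_letter': 'a'})
    'female'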
If the classifier encounters an input with a feature that has never been seen with any label, then rather than assigning a probability of 0 to all labels, it will ignore that feature. The feature value 'None' is reserved for unseen feature values; you generally should not use 'None' as a feature value for one of your own features. """ def __init__(self, label_probdist, feature_probdist): """ :param label_probdist: P(label), the probability distribution over labels. It is expressed as a ``ProbDistI`` whose samples are labels. I.e., P(label) = ``label_probdist.prob(label)``. :param feature_probdist: P(fname=fval|label), the probability distribution for feature values, given labels. It is expressed as a dictionary whose keys are ``(label, fname)`` pairs and whose values are ``ProbDistI`` objects over feature values. I.e., P(fname=fval|label) = ``feature_probdist[label,fname].prob(fval)``. If a given ``(label,fname)`` is not a key in ``feature_probdist``, then it is assumed that the corresponding P(fname=fval|label) is 0 for all values of ``fval``. """ self._label_probdist = label_probdist self._feature_probdist = feature_probdist self._labels = list(label_probdist.samples()) def labels(self): return self._labels def classify(self, featureset): return self.prob_classify(featureset).max() def prob_classify(self, featureset): # Discard any feature names that we've never seen before. # Otherwise, we'll just assign a probability of 0 to # everything. featureset = featureset.copy() for fname in list(featureset.keys()): for label in self._labels: if (label, fname) in self._feature_probdist: break else: #print 'Ignoring unseen feature %s' % fname del featureset[fname] # Find the log probabilty of each label, given the features. # Start with the log probability of the label itself. logprob = {} for label in self._labels: logprob[label] = self._label_probdist.logprob(label) # Then add in the log probability of features given labels. for label in self._labels: for (fname, fval) in featureset.items(): if (label, fname) in self._feature_probdist: feature_probs = self._feature_probdist[label, fname] logprob[label] += feature_probs.logprob(fval) else: # nb: This case will never come up if the # classifier was created by # NaiveBayesClassifier.train(). logprob[label] += sum_logs([]) # = -INF. return DictionaryProbDist(logprob, normalize=True, log=True) def show_most_informative_features(self, n=10): # Determine the most relevant features, and display them. cpdist = self._feature_probdist print('Most Informative Features') for (fname, fval) in self.most_informative_features(n): def labelprob(l): return cpdist[l, fname].prob(fval) labels = sorted([l for l in self._labels if fval in cpdist[l, fname].samples()], key=labelprob) if len(labels) == 1: continue l0 = labels[0] l1 = labels[-1] if cpdist[l0, fname].prob(fval) == 0: ratio = 'INF' else: ratio = '%8.1f' % (cpdist[l1, fname].prob(fval) / cpdist[l0, fname].prob(fval)) print(('%24s = %-14r %6s : %-6s = %s : 1.0' % (fname, fval, ("%s" % l1)[:6], ("%s" % l0)[:6], ratio))) def most_informative_features(self, n=100): """ Return a list of the 'most informative' features used by this classifier. For the purpose of this function, the informativeness of a feature ``(fname,fval)`` is equal to the highest value of P(fname=fval|label), for any label, divided by the lowest value of P(fname=fval|label), for any label: | max[ P(fname=fval|label1) / P(fname=fval|label2) ] """ # The set of (fname, fval) pairs used by this classifier. 
features = set() # The max & min probability associated w/ each (fname, fval) # pair. Maps (fname,fval) -> float. maxprob = defaultdict(lambda: 0.0) minprob = defaultdict(lambda: 1.0) for (label, fname), probdist in self._feature_probdist.items(): for fval in probdist.samples(): feature = (fname, fval) features.add(feature) p = probdist.prob(fval) maxprob[feature] = max(p, maxprob[feature]) minprob[feature] = min(p, minprob[feature]) if minprob[feature] == 0: features.discard(feature) # Convert features to a list, & sort it by how informative # features are. features = sorted(features, key=lambda feature_: minprob[feature_]/maxprob[feature_]) return features[:n] @classmethod def train(cls, labeled_featuresets, estimator=ELEProbDist): """ :param labeled_featuresets: A list of classified featuresets, i.e., a list of tuples ``(featureset, label)``. """ label_freqdist = FreqDist() feature_freqdist = defaultdict(FreqDist) feature_values = defaultdict(set) fnames = set() # Count up how many times each feature value occurred, given # the label and featurename. for featureset, label in labeled_featuresets: label_freqdist[label] += 1 for fname, fval in featureset.items(): # Increment freq(fval|label, fname) feature_freqdist[label, fname][fval] += 1 # Record that fname can take the value fval. feature_values[fname].add(fval) # Keep a list of all feature names. fnames.add(fname) # If a feature didn't have a value given for an instance, then # we assume that it gets the implicit value 'None.' This loop # counts up the number of 'missing' feature values for each # (label,fname) pair, and increments the count of the fval # 'None' by that amount. for label in label_freqdist: num_samples = label_freqdist[label] for fname in fnames: count = feature_freqdist[label, fname].N() # Only add a None key when necessary, i.e. if there are # any samples with feature 'fname' missing. if num_samples - count > 0: feature_freqdist[label, fname][None] += num_samples - count feature_values[fname].add(None) # Create the P(label) distribution label_probdist = estimator(label_freqdist) # Create the P(fval|label, fname) distribution feature_probdist = {} for ((label, fname), freqdist) in feature_freqdist.items(): probdist = estimator(freqdist, bins=len(feature_values[fname])) feature_probdist[label, fname] = probdist return cls(label_probdist, feature_probdist) ##////////////////////////////////////////////////////// ## Demo ##////////////////////////////////////////////////////// def demo(): from nltk.classify.util import names_demo classifier = names_demo(NaiveBayesClassifier.train) classifier.show_most_informative_features() if __name__ == '__main__': demo() nltk-3.1/nltk/classify/positivenaivebayes.py0000644000076500000240000001611512607224144021132 0ustar sbstaff00000000000000# Natural Language Toolkit: Positive Naive Bayes Classifier # # Copyright (C) 2012 NLTK Project # Author: Alessandro Presta # URL: # For license information, see LICENSE.TXT """ A variant of the Naive Bayes Classifier that performs binary classification with partially-labeled training sets. In other words, assume we want to build a classifier that assigns each example to one of two complementary classes (e.g., male names and female names). If we have a training set with labeled examples for both classes, we can use a standard Naive Bayes Classifier. However, consider the case when we only have labeled examples for one of the classes, and other, unlabeled, examples. 
Then, assuming a prior distribution on the two labels, we can use the unlabeled set to estimate the frequencies of the various features. Let the two possible labels be 1 and 0, and let's say we only have examples labeled 1 and unlabeled examples. We are also given an estimate of P(1). We compute P(feature|1) exactly as in the standard case. To compute P(feature|0), we first estimate P(feature) from the unlabeled set (we are assuming that the unlabeled examples are drawn according to the given prior distribution) and then express the conditional probability as: | P(feature) - P(feature|1) * P(1) | P(feature|0) = ---------------------------------- | P(0) Example: >>> from nltk.classify import PositiveNaiveBayesClassifier Some sentences about sports: >>> sports_sentences = [ 'The team dominated the game', ... 'They lost the ball', ... 'The game was intense', ... 'The goalkeeper catched the ball', ... 'The other team controlled the ball' ] Mixed topics, including sports: >>> various_sentences = [ 'The President did not comment', ... 'I lost the keys', ... 'The team won the game', ... 'Sara has two kids', ... 'The ball went off the court', ... 'They had the ball for the whole game', ... 'The show is over' ] The features of a sentence are simply the words it contains: >>> def features(sentence): ... words = sentence.lower().split() ... return dict(('contains(%s)' % w, True) for w in words) We use the sports sentences as positive examples, the mixed ones ad unlabeled examples: >>> positive_featuresets = list(map(features, sports_sentences)) >>> unlabeled_featuresets = list(map(features, various_sentences)) >>> classifier = PositiveNaiveBayesClassifier.train(positive_featuresets, ... unlabeled_featuresets) Is the following sentence about sports? >>> classifier.classify(features('The cat is on the table')) False What about this one? >>> classifier.classify(features('My team lost the game')) True """ from collections import defaultdict from nltk.probability import FreqDist, DictionaryProbDist, ELEProbDist from nltk.classify.naivebayes import NaiveBayesClassifier ##////////////////////////////////////////////////////// ## Positive Naive Bayes Classifier ##////////////////////////////////////////////////////// class PositiveNaiveBayesClassifier(NaiveBayesClassifier): @staticmethod def train(positive_featuresets, unlabeled_featuresets, positive_prob_prior=0.5, estimator=ELEProbDist): """ :param positive_featuresets: A list of featuresets that are known as positive examples (i.e., their label is ``True``). :param unlabeled_featuresets: A list of featuresets whose label is unknown. :param positive_prob_prior: A prior estimate of the probability of the label ``True`` (default 0.5). """ positive_feature_freqdist = defaultdict(FreqDist) unlabeled_feature_freqdist = defaultdict(FreqDist) feature_values = defaultdict(set) fnames = set() # Count up how many times each feature value occurred in positive examples. for featureset in positive_featuresets: for fname, fval in featureset.items(): positive_feature_freqdist[fname][fval] += 1 feature_values[fname].add(fval) fnames.add(fname) # Count up how many times each feature value occurred in unlabeled examples. for featureset in unlabeled_featuresets: for fname, fval in featureset.items(): unlabeled_feature_freqdist[fname][fval] += 1 feature_values[fname].add(fval) fnames.add(fname) # If a feature didn't have a value given for an instance, then we assume that # it gets the implicit value 'None'. 
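        # (This 'None' bookkeeping is done separately for the positive
        # featuresets and for the unlabeled featuresets below.)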
num_positive_examples = len(positive_featuresets) for fname in fnames: count = positive_feature_freqdist[fname].N() positive_feature_freqdist[fname][None] += num_positive_examples - count feature_values[fname].add(None) num_unlabeled_examples = len(unlabeled_featuresets) for fname in fnames: count = unlabeled_feature_freqdist[fname].N() unlabeled_feature_freqdist[fname][None] += num_unlabeled_examples - count feature_values[fname].add(None) negative_prob_prior = 1.0 - positive_prob_prior # Create the P(label) distribution. label_probdist = DictionaryProbDist({True: positive_prob_prior, False: negative_prob_prior}) # Create the P(fval|label, fname) distribution. feature_probdist = {} for fname, freqdist in positive_feature_freqdist.items(): probdist = estimator(freqdist, bins=len(feature_values[fname])) feature_probdist[True, fname] = probdist for fname, freqdist in unlabeled_feature_freqdist.items(): global_probdist = estimator(freqdist, bins=len(feature_values[fname])) negative_feature_probs = {} for fval in feature_values[fname]: prob = (global_probdist.prob(fval) - positive_prob_prior * feature_probdist[True, fname].prob(fval)) \ / negative_prob_prior # TODO: We need to add some kind of smoothing here, instead of # setting negative probabilities to zero and normalizing. negative_feature_probs[fval] = max(prob, 0.0) feature_probdist[False, fname] = DictionaryProbDist(negative_feature_probs, normalize=True) return PositiveNaiveBayesClassifier(label_probdist, feature_probdist) ##////////////////////////////////////////////////////// ## Demo ##////////////////////////////////////////////////////// def demo(): from nltk.classify.util import partial_names_demo classifier = partial_names_demo(PositiveNaiveBayesClassifier.train) classifier.show_most_informative_features() nltk-3.1/nltk/classify/rte_classify.py0000644000076500000240000001451512607224144017712 0ustar sbstaff00000000000000# Natural Language Toolkit: RTE Classifier # # Copyright (C) 2001-2015 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT """ Simple classifier for RTE corpus. It calculates the overlap in words and named entities between text and hypothesis, and also whether there are words / named entities in the hypothesis which fail to occur in the text, since this is an indicator that the hypothesis is more informative than (i.e not entailed by) the text. TO DO: better Named Entity classification TO DO: add lemmatization """ from __future__ import print_function import nltk from nltk.classify.util import accuracy def ne(token): """ This just assumes that words in all caps or titles are named entities. :type token: str """ if token.istitle() or token.isupper(): return True return False def lemmatize(word): """ Use morphy from WordNet to find the base form of verbs. """ lemma = nltk.corpus.wordnet.morphy(word, pos=nltk.corpus.wordnet.VERB) if lemma is not None: return lemma return word class RTEFeatureExtractor(object): """ This builds a bag of words for both the text and the hypothesis after throwing away some stopwords, then calculates overlap and difference. """ def __init__(self, rtepair, stop=True, lemmatize=False): """ :param rtepair: a ``RTEPair`` from which features should be extracted :param stop: if ``True``, stopwords are thrown away. 
:type stop: bool """ self.stop = stop self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is', 'have', 'are', 'were', 'and', 'very', '.', ',']) self.negwords = set(['no', 'not', 'never', 'failed', 'rejected', 'denied']) # Try to tokenize so that abbreviations like U.S.and monetary amounts # like "$23.00" are kept as tokens. from nltk.tokenize import RegexpTokenizer tokenizer = RegexpTokenizer('([A-Z]\.)+|\w+|\$[\d\.]+') #Get the set of word types for text and hypothesis self.text_tokens = tokenizer.tokenize(rtepair.text) self.hyp_tokens = tokenizer.tokenize(rtepair.hyp) self.text_words = set(self.text_tokens) self.hyp_words = set(self.hyp_tokens) if lemmatize: self.text_words = set(lemmatize(token) for token in self.text_tokens) self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens) if self.stop: self.text_words = self.text_words - self.stopwords self.hyp_words = self.hyp_words - self.stopwords self._overlap = self.hyp_words & self.text_words self._hyp_extra = self.hyp_words - self.text_words self._txt_extra = self.text_words - self.hyp_words def overlap(self, toktype, debug=False): """ Compute the overlap between text and hypothesis. :param toktype: distinguish Named Entities from ordinary words :type toktype: 'ne' or 'word' """ ne_overlap = set(token for token in self._overlap if ne(token)) if toktype == 'ne': if debug: print("ne overlap", ne_overlap) return ne_overlap elif toktype == 'word': if debug: print("word overlap", self._overlap - ne_overlap) return self._overlap - ne_overlap else: raise ValueError("Type not recognized:'%s'" % toktype) def hyp_extra(self, toktype, debug=True): """ Compute the extraneous material in the hypothesis. :param toktype: distinguish Named Entities from ordinary words :type toktype: 'ne' or 'word' """ ne_extra = set(token for token in self._hyp_extra if ne(token)) if toktype == 'ne': return ne_extra elif toktype == 'word': return self._hyp_extra - ne_extra else: raise ValueError("Type not recognized: '%s'" % toktype) def rte_features(rtepair): extractor = RTEFeatureExtractor(rtepair) features = {} features['alwayson'] = True features['word_overlap'] = len(extractor.overlap('word')) features['word_hyp_extra'] = len(extractor.hyp_extra('word')) features['ne_overlap'] = len(extractor.overlap('ne')) features['ne_hyp_extra'] = len(extractor.hyp_extra('ne')) features['neg_txt'] = len(extractor.negwords & extractor.text_words) features['neg_hyp'] = len(extractor.negwords & extractor.hyp_words) return features def rte_classifier(trainer, features=rte_features): """ Classify RTEPairs """ train = ((pair, pair.value) for pair in nltk.corpus.rte.pairs(['rte1_dev.xml', 'rte2_dev.xml', 'rte3_dev.xml'])) test = ((pair, pair.value) for pair in nltk.corpus.rte.pairs(['rte1_test.xml', 'rte2_test.xml', 'rte3_test.xml'])) # Train up a classifier. print('Training classifier...') classifier = trainer([(features(pair), label) for (pair, label) in train]) # Run the classifier on the test data. 
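    # The evaluation below applies the same ``features`` function used for
    # training (``rte_features`` by default) to the held-out RTE test pairs
    # (rte*_test.xml) and reports plain label accuracy via
    # nltk.classify.util.accuracy.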
print('Testing classifier...') acc = accuracy(classifier, [(features(pair), label) for (pair, label) in test]) print('Accuracy: %6.4f' % acc) # Return the classifier return classifier def demo_features(): pairs = nltk.corpus.rte.pairs(['rte1_dev.xml'])[:6] for pair in pairs: print() for key in sorted(rte_features(pair)): print("%-15s => %s" % (key, rte_features(pair)[key])) def demo_feature_extractor(): rtepair = nltk.corpus.rte.pairs(['rte3_dev.xml'])[33] extractor = RTEFeatureExtractor(rtepair) print(extractor.hyp_words) print(extractor.overlap('word')) print(extractor.overlap('ne')) print(extractor.hyp_extra('word')) def demo(): import nltk try: nltk.config_megam('/usr/local/bin/megam') trainer = lambda x: nltk.MaxentClassifier.train(x, 'megam') except ValueError: try: trainer = lambda x: nltk.MaxentClassifier.train(x, 'BFGS') except ValueError: trainer = nltk.MaxentClassifier.train nltk.classify.rte_classifier(trainer) if __name__ == '__main__': demo_features() demo_feature_extractor() demo() nltk-3.1/nltk/classify/scikitlearn.py0000644000076500000240000001321112607224144017523 0ustar sbstaff00000000000000# Natural Language Toolkit: Interface to scikit-learn classifiers # # Author: Lars Buitinck # URL: # For license information, see LICENSE.TXT """ scikit-learn (http://scikit-learn.org) is a machine learning library for Python. It supports many classification algorithms, including SVMs, Naive Bayes, logistic regression (MaxEnt) and decision trees. This package implement a wrapper around scikit-learn classifiers. To use this wrapper, construct a scikit-learn estimator object, then use that to construct a SklearnClassifier. E.g., to wrap a linear SVM with default settings: >>> from sklearn.svm import LinearSVC >>> from nltk.classify.scikitlearn import SklearnClassifier >>> classif = SklearnClassifier(LinearSVC()) A scikit-learn classifier may include preprocessing steps when it's wrapped in a Pipeline object. The following constructs and wraps a Naive Bayes text classifier with tf-idf weighting and chi-square feature selection to get the best 1000 features: >>> from sklearn.feature_extraction.text import TfidfTransformer >>> from sklearn.feature_selection import SelectKBest, chi2 >>> from sklearn.naive_bayes import MultinomialNB >>> from sklearn.pipeline import Pipeline >>> pipeline = Pipeline([('tfidf', TfidfTransformer()), ... ('chi2', SelectKBest(chi2, k=1000)), ... ('nb', MultinomialNB())]) >>> classif = SklearnClassifier(pipeline) """ from __future__ import print_function, unicode_literals from nltk.classify.api import ClassifierI from nltk.probability import DictionaryProbDist from nltk import compat try: from sklearn.feature_extraction import DictVectorizer from sklearn.preprocessing import LabelEncoder except ImportError: pass __all__ = ['SklearnClassifier'] @compat.python_2_unicode_compatible class SklearnClassifier(ClassifierI): """Wrapper for scikit-learn classifiers.""" def __init__(self, estimator, dtype=float, sparse=True): """ :param estimator: scikit-learn classifier object. :param dtype: data type used when building feature array. scikit-learn estimators work exclusively on numeric data. The default value should be fine for almost all situations. :param sparse: Whether to use sparse matrices internally. The estimator must support these; not all scikit-learn classifiers do (see their respective documentation and look for "sparse matrix"). The default value is True, since most NLP problems involve sparse feature sets. 
Setting this to False may take a great amount of memory. :type sparse: boolean. """ self._clf = estimator self._encoder = LabelEncoder() self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse) def __repr__(self): return "" % self._clf def classify_many(self, featuresets): """Classify a batch of samples. :param featuresets: An iterable over featuresets, each a dict mapping strings to either numbers, booleans or strings. :return: The predicted class label for each input sample. :rtype: list """ X = self._vectorizer.transform(featuresets) classes = self._encoder.classes_ return [classes[i] for i in self._clf.predict(X)] def prob_classify_many(self, featuresets): """Compute per-class probabilities for a batch of samples. :param featuresets: An iterable over featuresets, each a dict mapping strings to either numbers, booleans or strings. :rtype: list of ``ProbDistI`` """ X = self._vectorizer.transform(featuresets) y_proba_list = self._clf.predict_proba(X) return [self._make_probdist(y_proba) for y_proba in y_proba_list] def labels(self): """The class labels used by this classifier. :rtype: list """ return list(self._encoder.classes_) def train(self, labeled_featuresets): """ Train (fit) the scikit-learn estimator. :param labeled_featuresets: A list of ``(featureset, label)`` where each ``featureset`` is a dict mapping strings to either numbers, booleans or strings. """ X, y = list(compat.izip(*labeled_featuresets)) X = self._vectorizer.fit_transform(X) y = self._encoder.fit_transform(y) self._clf.fit(X, y) return self def _make_probdist(self, y_proba): classes = self._encoder.classes_ return DictionaryProbDist(dict((classes[i], p) for i, p in enumerate(y_proba))) # skip doctests if scikit-learn is not installed def setup_module(module): from nose import SkipTest try: import sklearn except ImportError: raise SkipTest("scikit-learn is not installed") if __name__ == "__main__": from nltk.classify.util import names_demo, names_demo_features from sklearn.linear_model import LogisticRegression from sklearn.naive_bayes import BernoulliNB # Bernoulli Naive Bayes is designed for binary classification. We set the # binarize option to False since we know we're passing boolean features. print("scikit-learn Naive Bayes:") names_demo(SklearnClassifier(BernoulliNB(binarize=False)).train, features=names_demo_features) # The C parameter on logistic regression (MaxEnt) controls regularization. # The higher it's set, the less regularized the classifier is. print("\n\nscikit-learn logistic regression:") names_demo(SklearnClassifier(LogisticRegression(C=1000)).train, features=names_demo_features) nltk-3.1/nltk/classify/senna.py0000644000076500000240000001551112607224144016324 0ustar sbstaff00000000000000# encoding: utf-8 # Natural Language Toolkit: Senna Interface # # Copyright (C) 2001-2015 NLTK Project # Author: Rami Al-Rfou' # URL: # For license information, see LICENSE.TXT """ A general interface to the SENNA pipeline that supports any of the operations specified in SUPPORTED_OPERATIONS. Applying multiple operations at once has the speed advantage. For example, Senna will automatically determine POS tags if you are extracting named entities. Applying both of the operations will cost only the time of extracting the named entities. The SENNA pipeline has a fixed maximum size of the sentences that it can read. By default it is 1024 token/sentence. If you have larger sentences, changing the MAX_SENTENCE_SIZE value in SENNA_main.c should be considered and your system specific binary should be rebuilt. 
Otherwise this could introduce misalignment errors. The input is: - path to the directory that contains SENNA executables. If the path is incorrect, Senna will automatically search for executable file specified in SENNA environment variable - List of the operations needed to be performed. - (optionally) the encoding of the input data (default:utf-8) >>> from __future__ import unicode_literals >>> from nltk.classify import Senna >>> pipeline = Senna('/usr/share/senna-v2.0', ['pos', 'chk', 'ner']) >>> sent = 'Dusseldorf is an international business center'.split() >>> [(token['word'], token['chk'], token['ner'], token['pos']) for token in pipeline.tag(sent)] [('Dusseldorf', 'B-NP', 'B-LOC', 'NNP'), ('is', 'B-VP', 'O', 'VBZ'), ('an', 'B-NP', 'O', 'DT'), ('international', 'I-NP', 'O', 'JJ'), ('business', 'I-NP', 'O', 'NN'), ('center', 'I-NP', 'O', 'NN')] """ from __future__ import unicode_literals from os import path, sep, environ from subprocess import Popen, PIPE from platform import architecture, system from nltk.tag.api import TaggerI from nltk.compat import text_type, python_2_unicode_compatible _senna_url = 'http://ml.nec-labs.com/senna/' @python_2_unicode_compatible class Senna(TaggerI): SUPPORTED_OPERATIONS = ['pos', 'chk', 'ner'] def __init__(self, senna_path, operations, encoding='utf-8'): self._encoding = encoding self._path = path.normpath(senna_path) + sep # Verifies the existence of the executable on the self._path first #senna_binary_file_1 = self.executable(self._path) exe_file_1 = self.executable(self._path) if not path.isfile(exe_file_1): # Check for the system environment if 'SENNA' in environ: #self._path = path.join(environ['SENNA'],'') self._path = path.normpath(environ['SENNA']) + sep exe_file_2 = self.executable(self._path) if not path.isfile(exe_file_2): raise OSError("Senna executable expected at %s or %s but not found" % (exe_file_1,exe_file_2)) self.operations = operations def executable(self, base_path): """ The function that determines the system specific binary that should be used in the pipeline. In case, the system is not known the default senna binary will be used. """ os_name = system() if os_name == 'Linux': bits = architecture()[0] if bits == '64bit': return path.join(base_path, 'senna-linux64') return path.join(base_path, 'senna-linux32') if os_name == 'Windows': return path.join(base_path, 'senna-win32.exe') if os_name == 'Darwin': return path.join(base_path, 'senna-osx') return path.join(base_path, 'senna') def _map(self): """ A method that calculates the order of the columns that SENNA pipeline will output the tags into. This depends on the operations being ordered. """ _map = {} i = 1 for operation in Senna.SUPPORTED_OPERATIONS: if operation in self.operations: _map[operation] = i i+= 1 return _map def tag(self, tokens): """ Applies the specified operation(s) on a list of tokens. """ return self.tag_sents([tokens])[0] def tag_sents(self, sentences): """ Applies the tag method over a list of sentences. This method will return a list of dictionaries. Every dictionary will contain a word with its calculated annotations/tags. 
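        A minimal usage sketch (illustrative only; it assumes a local SENNA
        installation at the path shown, as in the module docstring)::

            pipeline = Senna('/usr/share/senna-v2.0', ['pos'])
            tagged = pipeline.tag_sents([['The', 'cat', 'sat']])
            # tagged[0] is a list of dicts, one per token, each holding the
            # 'word' plus one key per requested operation, e.g. 'pos'.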
""" encoding = self._encoding if not path.isfile(self.executable(self._path)): raise OSError("Senna executable expected at %s but not found" % self.executable(self._path)) # Build the senna command to run the tagger _senna_cmd = [self.executable(self._path), '-path', self._path, '-usrtokens', '-iobtags'] _senna_cmd.extend(['-'+op for op in self.operations]) # Serialize the actual sentences to a temporary string _input = '\n'.join((' '.join(x) for x in sentences))+'\n' if isinstance(_input, text_type) and encoding: _input = _input.encode(encoding) # Run the tagger and get the output p = Popen(_senna_cmd, stdin=PIPE, stdout=PIPE, stderr=PIPE) (stdout, stderr) = p.communicate(input=_input) senna_output = stdout # Check the return code. if p.returncode != 0: raise RuntimeError('Senna command failed! Details: %s' % stderr) if encoding: senna_output = stdout.decode(encoding) # Output the tagged sentences map_ = self._map() tagged_sentences = [[]] sentence_index = 0 token_index = 0 for tagged_word in senna_output.strip().split("\n"): if not tagged_word: tagged_sentences.append([]) sentence_index += 1 token_index = 0 continue tags = tagged_word.split('\t') result = {} for tag in map_: result[tag] = tags[map_[tag]].strip() try: result['word'] = sentences[sentence_index][token_index] except IndexError: raise IndexError( "Misalignment error occurred at sentence number %d. Possible reason" " is that the sentence size exceeded the maximum size. Check the " "documentation of Senna class for more information." % sentence_index) tagged_sentences[-1].append(result) token_index += 1 return tagged_sentences # skip doctests if Senna is not installed def setup_module(module): from nose import SkipTest try: tagger = Senna('/usr/share/senna-v2.0', ['pos', 'chk', 'ner']) except OSError: raise SkipTest("Senna executable not found") nltk-3.1/nltk/classify/svm.py0000644000076500000240000000077412607224144016032 0ustar sbstaff00000000000000# Natural Language Toolkit: SVM-based classifier # # Copyright (C) 2001-2015 NLTK Project # Author: Leon Derczynski # # URL: # For license information, see LICENSE.TXT """ nltk.classify.svm was deprecated. For classification based on support vector machines SVMs use nltk.classify.scikitlearn (or `scikit-learn `_ directly). """ class SvmClassifier(object): def __init__(self, *args, **kwargs): raise NotImplementedError(__doc__) nltk-3.1/nltk/classify/tadm.py0000644000076500000240000000667312607224144016156 0ustar sbstaff00000000000000# Natural Language Toolkit: Interface to TADM Classifier # # Copyright (C) 2001-2015 NLTK Project # Author: Joseph Frazee # URL: # For license information, see LICENSE.TXT from __future__ import print_function, unicode_literals import sys import subprocess from nltk import compat from nltk.internals import find_binary try: import numpy except ImportError: pass _tadm_bin = None def config_tadm(bin=None): global _tadm_bin _tadm_bin = find_binary( 'tadm', bin, env_vars=['TADM'], binary_names=['tadm'], url='http://tadm.sf.net') def write_tadm_file(train_toks, encoding, stream): """ Generate an input file for ``tadm`` based on the given corpus of classified tokens. :type train_toks: list(tuple(dict, str)) :param train_toks: Training data, represented as a list of pairs, the first member of which is a feature dictionary, and the second of which is a classification label. :type encoding: TadmEventMaxentFeatureEncoding :param encoding: A feature encoding, used to convert featuresets into feature vectors. 
:type stream: stream :param stream: The stream to which the ``tadm`` input file should be written. """ # See the following for a file format description: # # http://sf.net/forum/forum.php?thread_id=1391502&forum_id=473054 # http://sf.net/forum/forum.php?thread_id=1675097&forum_id=473054 labels = encoding.labels() for featureset, label in train_toks: length_line = '%d\n' % len(labels) stream.write(length_line) for known_label in labels: v = encoding.encode(featureset, known_label) line = '%d %d %s\n' % ( int(label == known_label), len(v), ' '.join('%d %d' % u for u in v) ) stream.write(line) def parse_tadm_weights(paramfile): """ Given the stdout output generated by ``tadm`` when training a model, return a ``numpy`` array containing the corresponding weight vector. """ weights = [] for line in paramfile: weights.append(float(line.strip())) return numpy.array(weights, 'd') def call_tadm(args): """ Call the ``tadm`` binary with the given arguments. """ if isinstance(args, compat.string_types): raise TypeError('args should be a list of strings') if _tadm_bin is None: config_tadm() # Call tadm via a subprocess cmd = [_tadm_bin] + args p = subprocess.Popen(cmd, stdout=sys.stdout) (stdout, stderr) = p.communicate() # Check the return code. if p.returncode != 0: print() print(stderr) raise OSError('tadm command failed!') def names_demo(): from nltk.classify.util import names_demo from nltk.classify.maxent import TadmMaxentClassifier classifier = names_demo(TadmMaxentClassifier.train) def encoding_demo(): import sys from nltk.classify.maxent import TadmEventMaxentFeatureEncoding tokens = [({'f0':1, 'f1':1, 'f3':1}, 'A'), ({'f0':1, 'f2':1, 'f4':1}, 'B'), ({'f0':2, 'f2':1, 'f3':1, 'f4':1}, 'A')] encoding = TadmEventMaxentFeatureEncoding.train(tokens) write_tadm_file(tokens, encoding, sys.stdout) print() for i in range(encoding.length()): print('%s --> %d' % (encoding.describe(i), i)) print() if __name__ == '__main__': encoding_demo() names_demo() nltk-3.1/nltk/classify/textcat.py0000644000076500000240000001424012607224144016672 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Language ID module using TextCat algorithm # # Copyright (C) 2001-2015 NLTK Project # Author: Avital Pekker # # URL: # For license information, see LICENSE.TXT """ A module for language identification using the TextCat algorithm. An implementation of the text categorization algorithm presented in Cavnar, W. B. and J. M. Trenkle, "N-Gram-Based Text Categorization". The algorithm takes advantage of Zipf's law and uses n-gram frequencies to profile languages and text-yet to be identified-then compares using a distance measure. Language n-grams are provided by the "An Crubadan" project. A corpus reader was created seperately to read those files. For details regarding the algorithm, see: http://www.let.rug.nl/~vannoord/TextCat/textcat.pdf For details about An Crubadan, see: http://borel.slu.edu/crubadan/index.html """ # Ensure that literal strings default to unicode rather than str. from __future__ import print_function, unicode_literals from nltk.compat import PY3 from nltk.util import trigrams if PY3: from sys import maxsize else: from sys import maxint # Note: this is NOT "re" you're likely used to. The regex module # is an alternative to the standard re module that supports # Unicode codepoint properties with the \p{} syntax. 
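# For example, remove_punctuation() below relies on the Unicode property
# class \p{P} (punctuation) in the pattern r"[^\P{P}\']+", which the
# standard re module cannot handle.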
# You may have to "pip install regx" try: import regex as re except ImportError: re = None ###################################################################### ## Language identification using TextCat ###################################################################### class TextCat(object): _corpus = None fingerprints = {} _START_CHAR = "<" _END_CHAR = ">" last_distances = {} def __init__(self): if not re: raise EnvironmentError("classify.textcat requires the regex module that " "supports unicode. Try '$ pip install regex' and " "see https://pypi.python.org/pypi/regex for " "further details.") from nltk.corpus import crubadan self._corpus = crubadan # Load all language ngrams into cache for lang in self._corpus.langs(): self._corpus.lang_freq(lang) def remove_punctuation(self, text): ''' Get rid of punctuation except apostrophes ''' return re.sub(r"[^\P{P}\']+", "", text) def profile(self, text): ''' Create FreqDist of trigrams within text ''' from nltk import word_tokenize, FreqDist clean_text = self.remove_punctuation(text) tokens = word_tokenize(clean_text) fingerprint = FreqDist() for t in tokens: token_trigram_tuples = trigrams(self._START_CHAR + t + self._END_CHAR) token_trigrams = [''.join(tri) for tri in token_trigram_tuples] for cur_trigram in token_trigrams: if cur_trigram in fingerprint: fingerprint[cur_trigram] += 1 else: fingerprint[cur_trigram] = 1 return fingerprint def calc_dist(self, lang, trigram, text_profile): ''' Calculate the "out-of-place" measure between the text and language profile for a single trigram ''' lang_fd = self._corpus.lang_freq(lang) dist = 0 if trigram in lang_fd: idx_lang_profile = list(lang_fd.keys()).index(trigram) idx_text = list(text_profile.keys()).index(trigram) #print(idx_lang_profile, ", ", idx_text) dist = abs(idx_lang_profile - idx_text) else: # Arbitrary but should be larger than # any possible trigram file length # in terms of total lines if PY3: dist = maxsize else: dist = maxint return dist def lang_dists(self, text): ''' Calculate the "out-of-place" measure between the text and all languages ''' distances = {} profile = self.profile(text) # For all the languages for lang in self._corpus._all_lang_freq.keys(): # Calculate distance metric for every trigram in # input text to be identified lang_dist = 0 for trigram in profile: lang_dist += self.calc_dist(lang, trigram, profile) distances[lang] = lang_dist return distances def guess_language(self, text): ''' Find the language with the min distance to the text and return its ISO 639-3 code ''' self.last_distances = self.lang_dists(text) return min(self.last_distances, key=self.last_distances.get) #################################################') def demo(): from nltk.corpus import udhr langs = ['Kurdish-UTF8', 'Abkhaz-UTF8', 'Farsi_Persian-UTF8', 'Hindi-UTF8', 'Hawaiian-UTF8', 'Russian-UTF8', 'Vietnamese-UTF8', 'Serbian_Srpski-UTF8','Esperanto-UTF8'] friendly = {'kmr':'Northern Kurdish', 'abk':'Abkhazian', 'pes':'Iranian Persian', 'hin':'Hindi', 'haw':'Hawaiian', 'rus':'Russian', 'vie':'Vietnamese', 'srp':'Serbian', 'epo':'Esperanto'} tc = TextCat() for cur_lang in langs: # Get raw data from UDHR corpus raw_sentences = udhr.sents(cur_lang) rows = len(raw_sentences) - 1 cols = list(map(len, raw_sentences)) sample = '' # Generate a sample text of the language for i in range(0, rows): cur_sent = '' for j in range(0, cols[i]): cur_sent += ' ' + raw_sentences[i][j] sample += cur_sent # Try to detect what it is print('Language snippet: ' + sample[0:140] + '...') guess = 
tc.guess_language(sample) print('Language detection: %s (%s)' % (guess, friendly[guess])) print('#' * 140) if __name__ == '__main__': demo() nltk-3.1/nltk/classify/util.py0000644000076500000240000002674512607224144016210 0ustar sbstaff00000000000000# Natural Language Toolkit: Classifier Utility Functions # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # URL: # For license information, see LICENSE.TXT """ Utility functions and classes for classifiers. """ from __future__ import print_function import math #from nltk.util import Deprecated import nltk.classify.util # for accuracy & log_likelihood from nltk.util import LazyMap ###################################################################### #{ Helper Functions ###################################################################### # alternative name possibility: 'map_featurefunc()'? # alternative name possibility: 'detect_features()'? # alternative name possibility: 'map_featuredetect()'? # or.. just have users use LazyMap directly? def apply_features(feature_func, toks, labeled=None): """ Use the ``LazyMap`` class to construct a lazy list-like object that is analogous to ``map(feature_func, toks)``. In particular, if ``labeled=False``, then the returned list-like object's values are equal to:: [feature_func(tok) for tok in toks] If ``labeled=True``, then the returned list-like object's values are equal to:: [(feature_func(tok), label) for (tok, label) in toks] The primary purpose of this function is to avoid the memory overhead involved in storing all the featuresets for every token in a corpus. Instead, these featuresets are constructed lazily, as-needed. The reduction in memory overhead can be especially significant when the underlying list of tokens is itself lazy (as is the case with many corpus readers). :param feature_func: The function that will be applied to each token. It should return a featureset -- i.e., a dict mapping feature names to feature values. :param toks: The list of tokens to which ``feature_func`` should be applied. If ``labeled=True``, then the list elements will be passed directly to ``feature_func()``. If ``labeled=False``, then the list elements should be tuples ``(tok,label)``, and ``tok`` will be passed to ``feature_func()``. :param labeled: If true, then ``toks`` contains labeled tokens -- i.e., tuples of the form ``(tok, label)``. (Default: auto-detect based on types.) """ if labeled is None: labeled = toks and isinstance(toks[0], (tuple, list)) if labeled: def lazy_func(labeled_token): return (feature_func(labeled_token[0]), labeled_token[1]) return LazyMap(lazy_func, toks) else: return LazyMap(feature_func, toks) def attested_labels(tokens): """ :return: A list of all labels that are attested in the given list of tokens. :rtype: list of (immutable) :param tokens: The list of classified tokens from which to extract labels. A classified token has the form ``(token, label)``. 
:type tokens: list """ return tuple(set(label for (tok, label) in tokens)) def log_likelihood(classifier, gold): results = classifier.prob_classify_many([fs for (fs, l) in gold]) ll = [pdist.prob(l) for ((fs, l), pdist) in zip(gold, results)] return math.log(float(sum(ll))/len(ll)) def accuracy(classifier, gold): results = classifier.classify_many([fs for (fs, l) in gold]) correct = [l == r for ((fs, l), r) in zip(gold, results)] if correct: return float(sum(correct))/len(correct) else: return 0 class CutoffChecker(object): """ A helper class that implements cutoff checks based on number of iterations and log likelihood. Accuracy cutoffs are also implemented, but they're almost never a good idea to use. """ def __init__(self, cutoffs): self.cutoffs = cutoffs.copy() if 'min_ll' in cutoffs: cutoffs['min_ll'] = -abs(cutoffs['min_ll']) if 'min_lldelta' in cutoffs: cutoffs['min_lldelta'] = abs(cutoffs['min_lldelta']) self.ll = None self.acc = None self.iter = 1 def check(self, classifier, train_toks): cutoffs = self.cutoffs self.iter += 1 if 'max_iter' in cutoffs and self.iter >= cutoffs['max_iter']: return True # iteration cutoff. new_ll = nltk.classify.util.log_likelihood(classifier, train_toks) if math.isnan(new_ll): return True if 'min_ll' in cutoffs or 'min_lldelta' in cutoffs: if 'min_ll' in cutoffs and new_ll >= cutoffs['min_ll']: return True # log likelihood cutoff if ('min_lldelta' in cutoffs and self.ll and ((new_ll - self.ll) <= abs(cutoffs['min_lldelta']))): return True # log likelihood delta cutoff self.ll = new_ll if 'max_acc' in cutoffs or 'min_accdelta' in cutoffs: new_acc = nltk.classify.util.log_likelihood( classifier, train_toks) if 'max_acc' in cutoffs and new_acc >= cutoffs['max_acc']: return True # log likelihood cutoff if ('min_accdelta' in cutoffs and self.acc and ((new_acc - self.acc) <= abs(cutoffs['min_accdelta']))): return True # log likelihood delta cutoff self.acc = new_acc return False # no cutoff reached. ###################################################################### #{ Demos ###################################################################### def names_demo_features(name): features = {} features['alwayson'] = True features['startswith'] = name[0].lower() features['endswith'] = name[-1].lower() for letter in 'abcdefghijklmnopqrstuvwxyz': features['count(%s)' % letter] = name.lower().count(letter) features['has(%s)' % letter] = letter in name.lower() return features def binary_names_demo_features(name): features = {} features['alwayson'] = True features['startswith(vowel)'] = name[0].lower() in 'aeiouy' features['endswith(vowel)'] = name[-1].lower() in 'aeiouy' for letter in 'abcdefghijklmnopqrstuvwxyz': features['count(%s)' % letter] = name.lower().count(letter) features['has(%s)' % letter] = letter in name.lower() features['startswith(%s)' % letter] = (letter == name[0].lower()) features['endswith(%s)' % letter] = (letter == name[-1].lower()) return features def names_demo(trainer, features=names_demo_features): from nltk.corpus import names import random # Construct a list of classified names, using the names corpus. namelist = ([(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]) # Randomly split the names into a test & train set. random.seed(123456) random.shuffle(namelist) train = namelist[:5000] test = namelist[5000:5500] # Train up a classifier. print('Training classifier...') classifier = trainer( [(features(n), g) for (n, g) in train] ) # Run the classifier on the test data. 
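    # accuracy() (defined earlier in this module) maps classify_many() over
    # the gold-standard test featuresets and returns the fraction of
    # predicted labels that match; e.g. 450 correct out of the 500 held-out
    # names would print as 0.9000.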
print('Testing classifier...') acc = accuracy(classifier, [(features(n), g) for (n, g) in test]) print('Accuracy: %6.4f' % acc) # For classifiers that can find probabilities, show the log # likelihood and some sample probability distributions. try: test_featuresets = [features(n) for (n, g) in test] pdists = classifier.prob_classify_many(test_featuresets) ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)] print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test))) print() print('Unseen Names P(Male) P(Female)\n'+'-'*40) for ((name, gender), pdist) in list(zip(test, pdists))[:5]: if gender == 'male': fmt = ' %-15s *%6.4f %6.4f' else: fmt = ' %-15s %6.4f *%6.4f' print(fmt % (name, pdist.prob('male'), pdist.prob('female'))) except NotImplementedError: pass # Return the classifier return classifier def partial_names_demo(trainer, features=names_demo_features): from nltk.corpus import names import random male_names = names.words('male.txt') female_names = names.words('female.txt') random.seed(654321) random.shuffle(male_names) random.shuffle(female_names) # Create a list of male names to be used as positive-labeled examples for training positive = map(features, male_names[:2000]) # Create a list of male and female names to be used as unlabeled examples unlabeled = map(features, male_names[2000:2500] + female_names[:500]) # Create a test set with correctly-labeled male and female names test = [(name, True) for name in male_names[2500:2750]] \ + [(name, False) for name in female_names[500:750]] random.shuffle(test) # Train up a classifier. print('Training classifier...') classifier = trainer(positive, unlabeled) # Run the classifier on the test data. print('Testing classifier...') acc = accuracy(classifier, [(features(n), m) for (n, m) in test]) print('Accuracy: %6.4f' % acc) # For classifiers that can find probabilities, show the log # likelihood and some sample probability distributions. try: test_featuresets = [features(n) for (n, m) in test] pdists = classifier.prob_classify_many(test_featuresets) ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)] print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test))) print() print('Unseen Names P(Male) P(Female)\n'+'-'*40) for ((name, is_male), pdist) in zip(test, pdists)[:5]: if is_male == True: fmt = ' %-15s *%6.4f %6.4f' else: fmt = ' %-15s %6.4f *%6.4f' print(fmt % (name, pdist.prob(True), pdist.prob(False))) except NotImplementedError: pass # Return the classifier return classifier _inst_cache = {} def wsd_demo(trainer, word, features, n=1000): from nltk.corpus import senseval import random # Get the instances. print('Reading data...') global _inst_cache if word not in _inst_cache: _inst_cache[word] = [(i, i.senses[0]) for i in senseval.instances(word)] instances = _inst_cache[word][:] if n > len(instances): n = len(instances) senses = list(set(l for (i, l) in instances)) print(' Senses: ' + ' '.join(senses)) # Randomly split the names into a test & train set. print('Splitting into test & train...') random.seed(123456) random.shuffle(instances) train = instances[:int(.8*n)] test = instances[int(.8*n):n] # Train up a classifier. print('Training classifier...') classifier = trainer([(features(i), l) for (i, l) in train]) # Run the classifier on the test data. print('Testing classifier...') acc = accuracy(classifier, [(features(i), l) for (i, l) in test]) print('Accuracy: %6.4f' % acc) # For classifiers that can find probabilities, show the log # likelihood and some sample probability distributions. 
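    # The figure printed below is the average log likelihood: the mean,
    # over test instances, of log P(gold sense | features). It is 0 for a
    # classifier that assigns probability 1 to every gold sense, and grows
    # more negative as probability mass is spread away from the gold senses.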
try: test_featuresets = [features(i) for (i, n) in test] pdists = classifier.prob_classify_many(test_featuresets) ll = [pdist.logprob(gold) for ((name, gold), pdist) in zip(test, pdists)] print('Avg. log likelihood: %6.4f' % (sum(ll)/len(test))) except NotImplementedError: pass # Return the classifier return classifier nltk-3.1/nltk/classify/weka.py0000644000076500000240000003052112607224144016145 0ustar sbstaff00000000000000# Natural Language Toolkit: Interface to Weka Classsifiers # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Classifiers that make use of the external 'Weka' package. """ from __future__ import print_function import time import tempfile import os import subprocess import re import zipfile from sys import stdin from nltk import compat from nltk.probability import DictionaryProbDist from nltk.internals import java, config_java from nltk.classify.api import ClassifierI _weka_classpath = None _weka_search = ['.', '/usr/share/weka', '/usr/local/share/weka', '/usr/lib/weka', '/usr/local/lib/weka',] def config_weka(classpath=None): global _weka_classpath # Make sure java's configured first. config_java() if classpath is not None: _weka_classpath = classpath if _weka_classpath is None: searchpath = _weka_search if 'WEKAHOME' in os.environ: searchpath.insert(0, os.environ['WEKAHOME']) for path in searchpath: if os.path.exists(os.path.join(path, 'weka.jar')): _weka_classpath = os.path.join(path, 'weka.jar') version = _check_weka_version(_weka_classpath) if version: print(('[Found Weka: %s (version %s)]' % (_weka_classpath, version))) else: print('[Found Weka: %s]' % _weka_classpath) _check_weka_version(_weka_classpath) if _weka_classpath is None: raise LookupError('Unable to find weka.jar! Use config_weka() ' 'or set the WEKAHOME environment variable. ' 'For more information about Weka, please see ' 'http://www.cs.waikato.ac.nz/ml/weka/') def _check_weka_version(jar): try: zf = zipfile.ZipFile(jar) except SystemExit as KeyboardInterrupt: raise except: return None try: try: return zf.read('weka/core/version.txt') except KeyError: return None finally: zf.close() class WekaClassifier(ClassifierI): def __init__(self, formatter, model_filename): self._formatter = formatter self._model = model_filename def prob_classify_many(self, featuresets): return self._classify_many(featuresets, ['-p', '0', '-distribution']) def classify_many(self, featuresets): return self._classify_many(featuresets, ['-p', '0']) def _classify_many(self, featuresets, options): # Make sure we can find java & weka. config_weka() temp_dir = tempfile.mkdtemp() try: # Write the test data file. test_filename = os.path.join(temp_dir, 'test.arff') self._formatter.write(test_filename, featuresets) # Call weka to classify the data. cmd = ['weka.classifiers.bayes.NaiveBayes', '-l', self._model, '-T', test_filename] + options (stdout, stderr) = java(cmd, classpath=_weka_classpath, stdout=subprocess.PIPE, stderr=subprocess.PIPE) # Check if something went wrong: if stderr and not stdout: if 'Illegal options: -distribution' in stderr: raise ValueError('The installed version of weka does ' 'not support probability distribution ' 'output.') else: raise ValueError('Weka failed to generate output:\n%s' % stderr) # Parse weka's output. 
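            # parse_weka_output() (below) expects the tabular section of
            # Weka's stdout: a header line beginning with "inst#", then one
            # "inst#  actual  predicted  error ..." row per test instance,
            # with the predicted class encoded as "index:label".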
return self.parse_weka_output(stdout.decode(stdin.encoding).split('\n')) finally: for f in os.listdir(temp_dir): os.remove(os.path.join(temp_dir, f)) os.rmdir(temp_dir) def parse_weka_distribution(self, s): probs = [float(v) for v in re.split('[*,]+', s) if v.strip()] probs = dict(zip(self._formatter.labels(), probs)) return DictionaryProbDist(probs) def parse_weka_output(self, lines): # Strip unwanted text from stdout for i,line in enumerate(lines): if line.strip().startswith("inst#"): lines = lines[i:] break if lines[0].split() == ['inst#', 'actual', 'predicted', 'error', 'prediction']: return [line.split()[2].split(':')[1] for line in lines[1:] if line.strip()] elif lines[0].split() == ['inst#', 'actual', 'predicted', 'error', 'distribution']: return [self.parse_weka_distribution(line.split()[-1]) for line in lines[1:] if line.strip()] # is this safe:? elif re.match(r'^0 \w+ [01]\.[0-9]* \?\s*$', lines[0]): return [line.split()[1] for line in lines if line.strip()] else: for line in lines[:10]: print(line) raise ValueError('Unhandled output format -- your version ' 'of weka may not be supported.\n' ' Header: %s' % lines[0]) # [xx] full list of classifiers (some may be abstract?): # ADTree, AODE, BayesNet, ComplementNaiveBayes, ConjunctiveRule, # DecisionStump, DecisionTable, HyperPipes, IB1, IBk, Id3, J48, # JRip, KStar, LBR, LeastMedSq, LinearRegression, LMT, Logistic, # LogisticBase, M5Base, MultilayerPerceptron, # MultipleClassifiersCombiner, NaiveBayes, NaiveBayesMultinomial, # NaiveBayesSimple, NBTree, NNge, OneR, PaceRegression, PART, # PreConstructedLinearModel, Prism, RandomForest, # RandomizableClassifier, RandomTree, RBFNetwork, REPTree, Ridor, # RuleNode, SimpleLinearRegression, SimpleLogistic, # SingleClassifierEnhancer, SMO, SMOreg, UserClassifier, VFI, # VotedPerceptron, Winnow, ZeroR _CLASSIFIER_CLASS = { 'naivebayes': 'weka.classifiers.bayes.NaiveBayes', 'C4.5': 'weka.classifiers.trees.J48', 'log_regression': 'weka.classifiers.functions.Logistic', 'svm': 'weka.classifiers.functions.SMO', 'kstar': 'weka.classifiers.lazy.KStar', 'ripper': 'weka.classifiers.rules.JRip', } @classmethod def train(cls, model_filename, featuresets, classifier='naivebayes', options=[], quiet=True): # Make sure we can find java & weka. config_weka() # Build an ARFF formatter. formatter = ARFF_Formatter.from_train(featuresets) temp_dir = tempfile.mkdtemp() try: # Write the training data file. train_filename = os.path.join(temp_dir, 'train.arff') formatter.write(train_filename, featuresets) if classifier in cls._CLASSIFIER_CLASS: javaclass = cls._CLASSIFIER_CLASS[classifier] elif classifier in cls._CLASSIFIER_CLASS.values(): javaclass = classifier else: raise ValueError('Unknown classifier %s' % classifier) # Train the weka model. cmd = [javaclass, '-d', model_filename, '-t', train_filename] cmd += list(options) if quiet: stdout = subprocess.PIPE else: stdout = None java(cmd, classpath=_weka_classpath, stdout=stdout) # Return the new classifier. return WekaClassifier(formatter, model_filename) finally: for f in os.listdir(temp_dir): os.remove(os.path.join(temp_dir, f)) os.rmdir(temp_dir) class ARFF_Formatter: """ Converts featuresets and labeled featuresets to ARFF-formatted strings, appropriate for input into Weka. Features and classes can be specified manually in the constructor, or may be determined from data using ``from_train``. """ def __init__(self, labels, features): """ :param labels: A list of all class labels that can be generated. 
:param features: A list of feature specifications, where each feature specification is a tuple (fname, ftype); and ftype is an ARFF type string such as NUMERIC or STRING. """ self._labels = labels self._features = features def format(self, tokens): """Returns a string representation of ARFF output for the given data.""" return self.header_section() + self.data_section(tokens) def labels(self): """Returns the list of classes.""" return list(self._labels) def write(self, outfile, tokens): """Writes ARFF data to a file for the given data.""" if not hasattr(outfile, 'write'): outfile = open(outfile, 'w') outfile.write(self.format(tokens)) outfile.close() @staticmethod def from_train(tokens): """ Constructs an ARFF_Formatter instance with class labels and feature types determined from the given data. Handles boolean, numeric and string (note: not nominal) types. """ # Find the set of all attested labels. labels = set(label for (tok, label) in tokens) # Determine the types of all features. features = {} for tok, label in tokens: for (fname, fval) in tok.items(): if issubclass(type(fval), bool): ftype = '{True, False}' elif issubclass(type(fval), (compat.integer_types, float, bool)): ftype = 'NUMERIC' elif issubclass(type(fval), compat.string_types): ftype = 'STRING' elif fval is None: continue # can't tell the type. else: raise ValueError('Unsupported value type %r' % ftype) if features.get(fname, ftype) != ftype: raise ValueError('Inconsistent type for %s' % fname) features[fname] = ftype features = sorted(features.items()) return ARFF_Formatter(labels, features) def header_section(self): """Returns an ARFF header as a string.""" # Header comment. s = ('% Weka ARFF file\n' + '% Generated automatically by NLTK\n' + '%% %s\n\n' % time.ctime()) # Relation name s += '@RELATION rel\n\n' # Input attribute specifications for fname, ftype in self._features: s += '@ATTRIBUTE %-30r %s\n' % (fname, ftype) # Label attribute specification s += '@ATTRIBUTE %-30r {%s}\n' % ('-label-', ','.join(self._labels)) return s def data_section(self, tokens, labeled=None): """ Returns the ARFF data section for the given data. :param tokens: a list of featuresets (dicts) or labelled featuresets which are tuples (featureset, label). :param labeled: Indicates whether the given tokens are labeled or not. If None, then the tokens will be assumed to be labeled if the first token's value is a tuple or list. """ # Check if the tokens are labeled or unlabeled. If unlabeled, # then use 'None' if labeled is None: labeled = tokens and isinstance(tokens[0], (tuple, list)) if not labeled: tokens = [(tok, None) for tok in tokens] # Data section s = '\n@DATA\n' for (tok, label) in tokens: for fname, ftype in self._features: s += '%s,' % self._fmt_arff_val(tok.get(fname)) s += '%s\n' % self._fmt_arff_val(label) return s def _fmt_arff_val(self, fval): if fval is None: return '?' 
elif isinstance(fval, (bool, compat.integer_types)): return '%s' % fval elif isinstance(fval, float): return '%r' % fval else: return '%r' % fval if __name__ == '__main__': from nltk.classify.util import names_demo, binary_names_demo_features def make_classifier(featuresets): return WekaClassifier.train('/tmp/name.model', featuresets, 'C4.5') classifier = names_demo(make_classifier, binary_names_demo_features) nltk-3.1/nltk/cluster/0000755000076500000240000000000012610001541014473 5ustar sbstaff00000000000000nltk-3.1/nltk/cluster/__init__.py0000644000076500000240000001027312607224144016623 0ustar sbstaff00000000000000# Natural Language Toolkit: Clusterers # # Copyright (C) 2001-2015 NLTK Project # Author: Trevor Cohn # URL: # For license information, see LICENSE.TXT """ This module contains a number of basic clustering algorithms. Clustering describes the task of discovering groups of similar items with a large collection. It is also describe as unsupervised machine learning, as the data from which it learns is unannotated with class information, as is the case for supervised learning. Annotated data is difficult and expensive to obtain in the quantities required for the majority of supervised learning algorithms. This problem, the knowledge acquisition bottleneck, is common to most natural language processing tasks, thus fueling the need for quality unsupervised approaches. This module contains a k-means clusterer, E-M clusterer and a group average agglomerative clusterer (GAAC). All these clusterers involve finding good cluster groupings for a set of vectors in multi-dimensional space. The K-means clusterer starts with k arbitrary chosen means then allocates each vector to the cluster with the closest mean. It then recalculates the means of each cluster as the centroid of the vectors in the cluster. This process repeats until the cluster memberships stabilise. This is a hill-climbing algorithm which may converge to a local maximum. Hence the clustering is often repeated with random initial means and the most commonly occurring output means are chosen. The GAAC clusterer starts with each of the *N* vectors as singleton clusters. It then iteratively merges pairs of clusters which have the closest centroids. This continues until there is only one cluster. The order of merges gives rise to a dendrogram - a tree with the earlier merges lower than later merges. The membership of a given number of clusters *c*, *1 <= c <= N*, can be found by cutting the dendrogram at depth *c*. The Gaussian EM clusterer models the vectors as being produced by a mixture of k Gaussian sources. The parameters of these sources (prior probability, mean and covariance matrix) are then found to maximise the likelihood of the given data. This is done with the expectation maximisation algorithm. It starts with k arbitrarily chosen means, priors and covariance matrices. It then calculates the membership probabilities for each vector in each of the clusters - this is the 'E' step. The cluster parameters are then updated in the 'M' step using the maximum likelihood estimate from the cluster membership probabilities. This process continues until the likelihood of the data does not significantly increase. They all extend the ClusterI interface which defines common operations available with each clusterer. These operations include. 
- cluster: clusters a sequence of vectors - classify: assign a vector to a cluster - classification_probdist: give the probability distribution over cluster memberships The current existing classifiers also extend cluster.VectorSpace, an abstract class which allows for singular value decomposition (SVD) and vector normalisation. SVD is used to reduce the dimensionality of the vector space in such a manner as to preserve as much of the variation as possible, by reparameterising the axes in order of variability and discarding all bar the first d dimensions. Normalisation ensures that vectors fall in the unit hypersphere. Usage example (see also demo()):: from nltk import cluster from nltk.cluster import euclidean_distance from numpy import array vectors = [array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]] # initialise the clusterer (will also assign the vectors to clusters) clusterer = cluster.KMeansClusterer(2, euclidean_distance) clusterer.cluster(vectors, True) # classify a new vector print(clusterer.classify(array([3, 3]))) Note that the vectors must use numpy array-like objects. nltk_contrib.unimelb.tacohn.SparseArrays may be used for efficiency when required. """ from nltk.cluster.util import (VectorSpaceClusterer, Dendrogram, euclidean_distance, cosine_distance) from nltk.cluster.kmeans import KMeansClusterer from nltk.cluster.gaac import GAAClusterer from nltk.cluster.em import EMClusterer nltk-3.1/nltk/cluster/api.py0000644000076500000240000000401112607224144015626 0ustar sbstaff00000000000000# Natural Language Toolkit: Clusterer Interfaces # # Copyright (C) 2001-2015 NLTK Project # Author: Trevor Cohn # Porting: Steven Bird # URL: # For license information, see LICENSE.TXT from nltk.probability import DictionaryProbDist class ClusterI(object): """ Interface covering basic clustering functionality. """ def cluster(self, vectors, assign_clusters=False): """ Assigns the vectors to clusters, learning the clustering parameters from the data. Returns a cluster identifier for each vector. """ raise NotImplementedError() def classify(self, token): """ Classifies the token into a cluster, setting the token's CLUSTER parameter to that cluster identifier. """ raise NotImplementedError() def likelihood(self, vector, label): """ Returns the likelihood (a float) of the token having the corresponding cluster. """ if self.classify(vector) == label: return 1.0 else: return 0.0 def classification_probdist(self, vector): """ Classifies the token into a cluster, returning a probability distribution over the cluster identifiers. """ likelihoods = {} sum = 0.0 for cluster in self.cluster_names(): likelihoods[cluster] = self.likelihood(vector, cluster) sum += likelihoods[cluster] for cluster in self.cluster_names(): likelihoods[cluster] /= sum return DictionaryProbDist(likelihoods) def num_clusters(self): """ Returns the number of clusters. """ raise NotImplementedError() def cluster_names(self): """ Returns the names of the clusters. """ return list(range(self.num_clusters())) def cluster_name(self, index): """ Returns the names of the cluster at index. 
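        In this base implementation the name of a cluster is simply its
        integer index.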
""" return index nltk-3.1/nltk/cluster/em.py0000644000076500000240000002271312607224144015467 0ustar sbstaff00000000000000# Natural Language Toolkit: Expectation Maximization Clusterer # # Copyright (C) 2001-2015 NLTK Project # Author: Trevor Cohn # URL: # For license information, see LICENSE.TXT from __future__ import print_function, unicode_literals try: import numpy except ImportError: pass from nltk.compat import python_2_unicode_compatible from nltk.cluster.util import VectorSpaceClusterer @python_2_unicode_compatible class EMClusterer(VectorSpaceClusterer): """ The Gaussian EM clusterer models the vectors as being produced by a mixture of k Gaussian sources. The parameters of these sources (prior probability, mean and covariance matrix) are then found to maximise the likelihood of the given data. This is done with the expectation maximisation algorithm. It starts with k arbitrarily chosen means, priors and covariance matrices. It then calculates the membership probabilities for each vector in each of the clusters; this is the 'E' step. The cluster parameters are then updated in the 'M' step using the maximum likelihood estimate from the cluster membership probabilities. This process continues until the likelihood of the data does not significantly increase. """ def __init__(self, initial_means, priors=None, covariance_matrices=None, conv_threshold=1e-6, bias=0.1, normalise=False, svd_dimensions=None): """ Creates an EM clusterer with the given starting parameters, convergence threshold and vector mangling parameters. :param initial_means: the means of the gaussian cluster centers :type initial_means: [seq of] numpy array or seq of SparseArray :param priors: the prior probability for each cluster :type priors: numpy array or seq of float :param covariance_matrices: the covariance matrix for each cluster :type covariance_matrices: [seq of] numpy array :param conv_threshold: maximum change in likelihood before deemed convergent :type conv_threshold: int or float :param bias: variance bias used to ensure non-singular covariance matrices :type bias: float :param normalise: should vectors be normalised to length 1 :type normalise: boolean :param svd_dimensions: number of dimensions to use in reducing vector dimensionsionality with SVD :type svd_dimensions: int """ VectorSpaceClusterer.__init__(self, normalise, svd_dimensions) self._means = numpy.array(initial_means, numpy.float64) self._num_clusters = len(initial_means) self._conv_threshold = conv_threshold self._covariance_matrices = covariance_matrices self._priors = priors self._bias = bias def num_clusters(self): return self._num_clusters def cluster_vectorspace(self, vectors, trace=False): assert len(vectors) > 0 # set the parameters to initial values dimensions = len(vectors[0]) means = self._means priors = self._priors if not priors: priors = self._priors = numpy.ones(self._num_clusters, numpy.float64) / self._num_clusters covariances = self._covariance_matrices if not covariances: covariances = self._covariance_matrices = \ [ numpy.identity(dimensions, numpy.float64) for i in range(self._num_clusters) ] # do the E and M steps until the likelihood plateaus lastl = self._loglikelihood(vectors, priors, means, covariances) converged = False while not converged: if trace: print('iteration; loglikelihood', lastl) # E-step, calculate hidden variables, h[i,j] h = numpy.zeros((len(vectors), self._num_clusters), numpy.float64) for i in range(len(vectors)): for j in range(self._num_clusters): h[i,j] = priors[j] * self._gaussian(means[j], 
covariances[j], vectors[i]) h[i,:] /= sum(h[i,:]) # M-step, update parameters - cvm, p, mean for j in range(self._num_clusters): covariance_before = covariances[j] new_covariance = numpy.zeros((dimensions, dimensions), numpy.float64) new_mean = numpy.zeros(dimensions, numpy.float64) sum_hj = 0.0 for i in range(len(vectors)): delta = vectors[i] - means[j] new_covariance += h[i,j] * \ numpy.multiply.outer(delta, delta) sum_hj += h[i,j] new_mean += h[i,j] * vectors[i] covariances[j] = new_covariance / sum_hj means[j] = new_mean / sum_hj priors[j] = sum_hj / len(vectors) # bias term to stop covariance matrix being singular covariances[j] += self._bias * \ numpy.identity(dimensions, numpy.float64) # calculate likelihood - FIXME: may be broken l = self._loglikelihood(vectors, priors, means, covariances) # check for convergence if abs(lastl - l) < self._conv_threshold: converged = True lastl = l def classify_vectorspace(self, vector): best = None for j in range(self._num_clusters): p = self._priors[j] * self._gaussian(self._means[j], self._covariance_matrices[j], vector) if not best or p > best[0]: best = (p, j) return best[1] def likelihood_vectorspace(self, vector, cluster): cid = self.cluster_names().index(cluster) return self._priors[cluster] * self._gaussian(self._means[cluster], self._covariance_matrices[cluster], vector) def _gaussian(self, mean, cvm, x): m = len(mean) assert cvm.shape == (m, m), \ 'bad sized covariance matrix, %s' % str(cvm.shape) try: det = numpy.linalg.det(cvm) inv = numpy.linalg.inv(cvm) a = det ** -0.5 * (2 * numpy.pi) ** (-m / 2.0) dx = x - mean print(dx, inv) b = -0.5 * numpy.dot( numpy.dot(dx, inv), dx) return a * numpy.exp(b) except OverflowError: # happens when the exponent is negative infinity - i.e. b = 0 # i.e. the inverse of cvm is huge (cvm is almost zero) return 0 def _loglikelihood(self, vectors, priors, means, covariances): llh = 0.0 for vector in vectors: p = 0 for j in range(len(priors)): p += priors[j] * \ self._gaussian(means[j], covariances[j], vector) llh += numpy.log(p) return llh def __repr__(self): return '' % list(self._means) def demo(): """ Non-interactive demonstration of the clusterers with simple 2-D data. """ from nltk import cluster # example from figure 14.10, page 519, Manning and Schutze vectors = [numpy.array(f) for f in [[0.5, 0.5], [1.5, 0.5], [1, 3]]] means = [[4, 2], [4, 2.01]] clusterer = cluster.EMClusterer(means, bias=0.1) clusters = clusterer.cluster(vectors, True, trace=True) print('Clustered:', vectors) print('As: ', clusters) print() for c in range(2): print('Cluster:', c) print('Prior: ', clusterer._priors[c]) print('Mean: ', clusterer._means[c]) print('Covar: ', clusterer._covariance_matrices[c]) print() # classify a new vector vector = numpy.array([2, 2]) print('classify(%s):' % vector, end=' ') print(clusterer.classify(vector)) # show the classification probabilities vector = numpy.array([2, 2]) print('classification_probdist(%s):' % vector) pdist = clusterer.classification_probdist(vector) for sample in pdist.samples(): print('%s => %.0f%%' % (sample, pdist.prob(sample) *100)) # # The following demo code is broken. 
# # # use a set of tokens with 2D indices # vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]] # # test the EM clusterer with means given by k-means (2) and # # dimensionality reduction # clusterer = cluster.KMeans(2, euclidean_distance, svd_dimensions=1) # print 'Clusterer:', clusterer # clusters = clusterer.cluster(vectors) # means = clusterer.means() # print 'Means:', clusterer.means() # print # clusterer = cluster.EMClusterer(means, svd_dimensions=1) # clusters = clusterer.cluster(vectors, True) # print 'Clusterer:', clusterer # print 'Clustered:', str(vectors)[:60], '...' # print 'As:', str(clusters)[:60], '...' # print # # classify a new vector # vector = numpy.array([3, 3]) # print 'classify(%s):' % vector, # print clusterer.classify(vector) # print # # show the classification probabilities # vector = numpy.array([2.2, 2]) # print 'classification_probdist(%s)' % vector # pdist = clusterer.classification_probdist(vector) # for sample in pdist: # print '%s => %.0f%%' % (sample, pdist.prob(sample) *100) if __name__ == '__main__': demo() nltk-3.1/nltk/cluster/gaac.py0000644000076500000240000001326412607224144015762 0ustar sbstaff00000000000000# Natural Language Toolkit: Group Average Agglomerative Clusterer # # Copyright (C) 2001-2015 NLTK Project # Author: Trevor Cohn # URL: # For license information, see LICENSE.TXT from __future__ import print_function, unicode_literals try: import numpy except ImportError: pass from nltk.cluster.util import VectorSpaceClusterer, Dendrogram, cosine_distance from nltk.compat import python_2_unicode_compatible @python_2_unicode_compatible class GAAClusterer(VectorSpaceClusterer): """ The Group Average Agglomerative starts with each of the N vectors as singleton clusters. It then iteratively merges pairs of clusters which have the closest centroids. This continues until there is only one cluster. The order of merges gives rise to a dendrogram: a tree with the earlier merges lower than later merges. The membership of a given number of clusters c, 1 <= c <= N, can be found by cutting the dendrogram at depth c. This clusterer uses the cosine similarity metric only, which allows for efficient speed-up in the clustering process. 
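    A small usage sketch (the exact assignment shown is indicative only and is
    not checked here):

        >>> import numpy
        >>> from nltk.cluster import GAAClusterer
        >>> vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0]]]
        >>> clusterer = GAAClusterer(2)
        >>> clusterer.cluster(vectors, True)        # doctest: +SKIP
        [0, 0, 1, 1]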
""" def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None): VectorSpaceClusterer.__init__(self, normalise, svd_dimensions) self._num_clusters = num_clusters self._dendrogram = None self._groups_values = None def cluster(self, vectors, assign_clusters=False, trace=False): # stores the merge order self._dendrogram = Dendrogram( [numpy.array(vector, numpy.float64) for vector in vectors]) return VectorSpaceClusterer.cluster(self, vectors, assign_clusters, trace) def cluster_vectorspace(self, vectors, trace=False): # variables describing the initial situation N = len(vectors) cluster_len = [1]*N cluster_count = N index_map = numpy.arange(N) # construct the similarity matrix dims = (N, N) dist = numpy.ones(dims, dtype=numpy.float)*numpy.inf for i in range(N): for j in range(i+1, N): dist[i, j] = cosine_distance(vectors[i], vectors[j]) while cluster_count > max(self._num_clusters, 1): i, j = numpy.unravel_index(dist.argmin(), dims) if trace: print("merging %d and %d" % (i, j)) # update similarities for merging i and j self._merge_similarities(dist, cluster_len, i, j) # remove j dist[:, j] = numpy.inf dist[j, :] = numpy.inf # merge the clusters cluster_len[i] = cluster_len[i]+cluster_len[j] self._dendrogram.merge(index_map[i], index_map[j]) cluster_count -= 1 # update the index map to reflect the indexes if we # had removed j index_map[j+1:] -= 1 index_map[j] = N self.update_clusters(self._num_clusters) def _merge_similarities(self, dist, cluster_len, i, j): # the new cluster i merged from i and j adopts the average of # i and j's similarity to each other cluster, weighted by the # number of points in the clusters i and j i_weight = cluster_len[i] j_weight = cluster_len[j] weight_sum = i_weight+j_weight # update for x 0 if self._should_normalise: centroid = self._normalise(cluster[0]) else: centroid = numpy.array(cluster[0]) for vector in cluster[1:]: if self._should_normalise: centroid += self._normalise(vector) else: centroid += vector centroid /= float(len(cluster)) self._centroids.append(centroid) self._num_clusters = len(self._centroids) def classify_vectorspace(self, vector): best = None for i in range(self._num_clusters): centroid = self._centroids[i] dist = cosine_distance(vector, centroid) if not best or dist < best[0]: best = (dist, i) return best[1] def dendrogram(self): """ :return: The dendrogram representing the current clustering :rtype: Dendrogram """ return self._dendrogram def num_clusters(self): return self._num_clusters def __repr__(self): return '' % self._num_clusters def demo(): """ Non-interactive demonstration of the clusterers with simple 2-D data. 
""" from nltk.cluster import GAAClusterer # use a set of tokens with 2D indices vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]] # test the GAAC clusterer with 4 clusters clusterer = GAAClusterer(4) clusters = clusterer.cluster(vectors, True) print('Clusterer:', clusterer) print('Clustered:', vectors) print('As:', clusters) print() # show the dendrogram clusterer.dendrogram().show() # classify a new vector vector = numpy.array([3, 3]) print('classify(%s):' % vector, end=' ') print(clusterer.classify(vector)) print() if __name__ == '__main__': demo() nltk-3.1/nltk/cluster/kmeans.py0000644000076500000240000002036712607224144016347 0ustar sbstaff00000000000000# Natural Language Toolkit: K-Means Clusterer # # Copyright (C) 2001-2015 NLTK Project # Author: Trevor Cohn # URL: # For license information, see LICENSE.TXT from __future__ import print_function, unicode_literals import copy import random import sys try: import numpy except ImportError: pass from nltk.cluster.util import VectorSpaceClusterer from nltk.compat import python_2_unicode_compatible @python_2_unicode_compatible class KMeansClusterer(VectorSpaceClusterer): """ The K-means clusterer starts with k arbitrary chosen means then allocates each vector to the cluster with the closest mean. It then recalculates the means of each cluster as the centroid of the vectors in the cluster. This process repeats until the cluster memberships stabilise. This is a hill-climbing algorithm which may converge to a local maximum. Hence the clustering is often repeated with random initial means and the most commonly occurring output means are chosen. """ def __init__(self, num_means, distance, repeats=1, conv_test=1e-6, initial_means=None, normalise=False, svd_dimensions=None, rng=None, avoid_empty_clusters=False): """ :param num_means: the number of means to use (may use fewer) :type num_means: int :param distance: measure of distance between two vectors :type distance: function taking two vectors and returing a float :param repeats: number of randomised clustering trials to use :type repeats: int :param conv_test: maximum variation in mean differences before deemed convergent :type conv_test: number :param initial_means: set of k initial means :type initial_means: sequence of vectors :param normalise: should vectors be normalised to length 1 :type normalise: boolean :param svd_dimensions: number of dimensions to use in reducing vector dimensionsionality with SVD :type svd_dimensions: int :param rng: random number generator (or None) :type rng: Random :param avoid_empty_clusters: include current centroid in computation of next one; avoids undefined behavior when clusters become empty :type avoid_empty_clusters: boolean """ VectorSpaceClusterer.__init__(self, normalise, svd_dimensions) self._num_means = num_means self._distance = distance self._max_difference = conv_test assert not initial_means or len(initial_means) == num_means self._means = initial_means assert repeats >= 1 assert not (initial_means and repeats > 1) self._repeats = repeats self._rng = (rng if rng else random.Random()) self._avoid_empty_clusters = avoid_empty_clusters def cluster_vectorspace(self, vectors, trace=False): if self._means and self._repeats > 1: print('Warning: means will be discarded for subsequent trials') meanss = [] for trial in range(self._repeats): if trace: print('k-means trial', trial) if not self._means or trial > 1: self._means = self._rng.sample(list(vectors), self._num_means) self._cluster_vectorspace(vectors, trace) 
meanss.append(self._means) if len(meanss) > 1: # sort the means first (so that different cluster numbering won't # effect the distance comparison) for means in meanss: means.sort(key=sum) # find the set of means that's minimally different from the others min_difference = min_means = None for i in range(len(meanss)): d = 0 for j in range(len(meanss)): if i != j: d += self._sum_distances(meanss[i], meanss[j]) if min_difference is None or d < min_difference: min_difference, min_means = d, meanss[i] # use the best means self._means = min_means def _cluster_vectorspace(self, vectors, trace=False): if self._num_means < len(vectors): # perform k-means clustering converged = False while not converged: # assign the tokens to clusters based on minimum distance to # the cluster means clusters = [[] for m in range(self._num_means)] for vector in vectors: index = self.classify_vectorspace(vector) clusters[index].append(vector) if trace: print('iteration') #for i in range(self._num_means): #print ' mean', i, 'allocated', len(clusters[i]), 'vectors' # recalculate cluster means by computing the centroid of each cluster new_means = list(map(self._centroid, clusters, self._means)) # measure the degree of change from the previous step for convergence difference = self._sum_distances(self._means, new_means) if difference < self._max_difference: converged = True # remember the new means self._means = new_means def classify_vectorspace(self, vector): # finds the closest cluster centroid # returns that cluster's index best_distance = best_index = None for index in range(len(self._means)): mean = self._means[index] dist = self._distance(vector, mean) if best_distance is None or dist < best_distance: best_index, best_distance = index, dist return best_index def num_clusters(self): if self._means: return len(self._means) else: return self._num_means def means(self): """ The means used for clustering. 
""" return self._means def _sum_distances(self, vectors1, vectors2): difference = 0.0 for u, v in zip(vectors1, vectors2): difference += self._distance(u, v) return difference def _centroid(self, cluster, mean): if self._avoid_empty_clusters: centroid = copy.copy(mean) for vector in cluster: centroid += vector return centroid / (1+float(len(cluster))) else: if not len(cluster): sys.stderr.write('Error: no centroid defined for empty cluster.\n') sys.stderr.write('Try setting argument \'avoid_empty_clusters\' to True\n') assert(False) centroid = copy.copy(cluster[0]) for vector in cluster[1:]: centroid += vector return centroid / float(len(cluster)) def __repr__(self): return '' % \ (self._means, self._repeats) ################################################################################# def demo(): # example from figure 14.9, page 517, Manning and Schutze from nltk.cluster import KMeansClusterer, euclidean_distance vectors = [numpy.array(f) for f in [[2, 1], [1, 3], [4, 7], [6, 7]]] means = [[4, 3], [5, 5]] clusterer = KMeansClusterer(2, euclidean_distance, initial_means=means) clusters = clusterer.cluster(vectors, True, trace=True) print('Clustered:', vectors) print('As:', clusters) print('Means:', clusterer.means()) print() vectors = [numpy.array(f) for f in [[3, 3], [1, 2], [4, 2], [4, 0], [2, 3], [3, 1]]] # test k-means using the euclidean distance metric, 2 means and repeat # clustering 10 times with random seeds clusterer = KMeansClusterer(2, euclidean_distance, repeats=10) clusters = clusterer.cluster(vectors, True) print('Clustered:', vectors) print('As:', clusters) print('Means:', clusterer.means()) print() # classify a new vector vector = numpy.array([3, 3]) print('classify(%s):' % vector, end=' ') print(clusterer.classify(vector)) print() if __name__ == '__main__': demo() nltk-3.1/nltk/cluster/util.py0000644000076500000240000002266412607224144016050 0ustar sbstaff00000000000000# Natural Language Toolkit: Clusterer Utilities # # Copyright (C) 2001-2015 NLTK Project # Author: Trevor Cohn # URL: # For license information, see LICENSE.TXT from __future__ import print_function, unicode_literals import copy from sys import stdout from math import sqrt try: import numpy except ImportError: pass from nltk.cluster.api import ClusterI from nltk.compat import python_2_unicode_compatible class VectorSpaceClusterer(ClusterI): """ Abstract clusterer which takes tokens and maps them into a vector space. Optionally performs singular value decomposition to reduce the dimensionality. 
""" def __init__(self, normalise=False, svd_dimensions=None): """ :param normalise: should vectors be normalised to length 1 :type normalise: boolean :param svd_dimensions: number of dimensions to use in reducing vector dimensionsionality with SVD :type svd_dimensions: int """ self._Tt = None self._should_normalise = normalise self._svd_dimensions = svd_dimensions def cluster(self, vectors, assign_clusters=False, trace=False): assert len(vectors) > 0 # normalise the vectors if self._should_normalise: vectors = list(map(self._normalise, vectors)) # use SVD to reduce the dimensionality if self._svd_dimensions and self._svd_dimensions < len(vectors[0]): [u, d, vt] = numpy.linalg.svd(numpy.transpose(numpy.array(vectors))) S = d[:self._svd_dimensions] * \ numpy.identity(self._svd_dimensions, numpy.float64) T = u[:,:self._svd_dimensions] Dt = vt[:self._svd_dimensions,:] vectors = numpy.transpose(numpy.dot(S, Dt)) self._Tt = numpy.transpose(T) # call abstract method to cluster the vectors self.cluster_vectorspace(vectors, trace) # assign the vectors to clusters if assign_clusters: return [self.classify(vector) for vector in vectors] def cluster_vectorspace(self, vectors, trace): """ Finds the clusters using the given set of vectors. """ raise NotImplementedError() def classify(self, vector): if self._should_normalise: vector = self._normalise(vector) if self._Tt is not None: vector = numpy.dot(self._Tt, vector) cluster = self.classify_vectorspace(vector) return self.cluster_name(cluster) def classify_vectorspace(self, vector): """ Returns the index of the appropriate cluster for the vector. """ raise NotImplementedError() def likelihood(self, vector, label): if self._should_normalise: vector = self._normalise(vector) if self._Tt is not None: vector = numpy.dot(self._Tt, vector) return self.likelihood_vectorspace(vector, label) def likelihood_vectorspace(self, vector, cluster): """ Returns the likelihood of the vector belonging to the cluster. """ predicted = self.classify_vectorspace(vector) return (1.0 if cluster == predicted else 0.0) def vector(self, vector): """ Returns the vector after normalisation and dimensionality reduction """ if self._should_normalise: vector = self._normalise(vector) if self._Tt is not None: vector = numpy.dot(self._Tt, vector) return vector def _normalise(self, vector): """ Normalises the vector to unit length. """ return vector / sqrt(numpy.dot(vector, vector)) def euclidean_distance(u, v): """ Returns the euclidean distance between vectors u and v. This is equivalent to the length of the vector (u - v). """ diff = u - v return sqrt(numpy.dot(diff, diff)) def cosine_distance(u, v): """ Returns 1 minus the cosine of the angle between vectors v and u. This is equal to 1 - (u.v / |u||v|). """ return 1 - (numpy.dot(u, v) / (sqrt(numpy.dot(u, u)) * sqrt(numpy.dot(v, v)))) class _DendrogramNode(object): """ Tree node of a dendrogram. 
""" def __init__(self, value, *children): self._value = value self._children = children def leaves(self, values=True): if self._children: leaves = [] for child in self._children: leaves.extend(child.leaves(values)) return leaves elif values: return [self._value] else: return [self] def groups(self, n): queue = [(self._value, self)] while len(queue) < n: priority, node = queue.pop() if not node._children: queue.push((priority, node)) break for child in node._children: if child._children: queue.append((child._value, child)) else: queue.append((0, child)) # makes the earliest merges at the start, latest at the end queue.sort() groups = [] for priority, node in queue: groups.append(node.leaves()) return groups @python_2_unicode_compatible class Dendrogram(object): """ Represents a dendrogram, a tree with a specified branching order. This must be initialised with the leaf items, then iteratively call merge for each branch. This class constructs a tree representing the order of calls to the merge function. """ def __init__(self, items=[]): """ :param items: the items at the leaves of the dendrogram :type items: sequence of (any) """ self._items = [_DendrogramNode(item) for item in items] self._original_items = copy.copy(self._items) self._merge = 1 def merge(self, *indices): """ Merges nodes at given indices in the dendrogram. The nodes will be combined which then replaces the first node specified. All other nodes involved in the merge will be removed. :param indices: indices of the items to merge (at least two) :type indices: seq of int """ assert len(indices) >= 2 node = _DendrogramNode(self._merge, *[self._items[i] for i in indices]) self._merge += 1 self._items[indices[0]] = node for i in indices[1:]: del self._items[i] def groups(self, n): """ Finds the n-groups of items (leaves) reachable from a cut at depth n. :param n: number of groups :type n: int """ if len(self._items) > 1: root = _DendrogramNode(self._merge, *self._items) else: root = self._items[0] return root.groups(n) def show(self, leaf_labels=[]): """ Print the dendrogram in ASCII art to standard out. 
:param leaf_labels: an optional list of strings to use for labeling the leaves :type leaf_labels: list """ # ASCII rendering characters JOIN, HLINK, VLINK = '+', '-', '|' # find the root (or create one) if len(self._items) > 1: root = _DendrogramNode(self._merge, *self._items) else: root = self._items[0] leaves = self._original_items if leaf_labels: last_row = leaf_labels else: last_row = ["%s" % leaf._value for leaf in leaves] # find the bottom row and the best cell width width = max(map(len, last_row)) + 1 lhalf = width / 2 rhalf = width - lhalf - 1 # display functions def format(centre, left=' ', right=' '): return '%s%s%s' % (lhalf*left, centre, right*rhalf) def display(str): stdout.write(str) # for each merge, top down queue = [(root._value, root)] verticals = [ format(' ') for leaf in leaves ] while queue: priority, node = queue.pop() child_left_leaf = list(map(lambda c: c.leaves(False)[0], node._children)) indices = list(map(leaves.index, child_left_leaf)) if child_left_leaf: min_idx = min(indices) max_idx = max(indices) for i in range(len(leaves)): if leaves[i] in child_left_leaf: if i == min_idx: display(format(JOIN, ' ', HLINK)) elif i == max_idx: display(format(JOIN, HLINK, ' ')) else: display(format(JOIN, HLINK, HLINK)) verticals[i] = format(VLINK) elif min_idx <= i <= max_idx: display(format(HLINK, HLINK, HLINK)) else: display(verticals[i]) display('\n') for child in node._children: if child._children: queue.append((child._value, child)) queue.sort() for vertical in verticals: display(vertical) display('\n') # finally, display the last line display(''.join(item.center(width) for item in last_row)) display('\n') def __repr__(self): if len(self._items) > 1: root = _DendrogramNode(self._merge, *self._items) else: root = self._items[0] leaves = root.leaves(False) return '' % len(leaves) nltk-3.1/nltk/collocations.py0000644000076500000240000003424012607224144016074 0ustar sbstaff00000000000000# Natural Language Toolkit: Collocations and Association Measures # # Copyright (C) 2001-2015 NLTK Project # Author: Joel Nothman # URL: # For license information, see LICENSE.TXT # """ Tools to identify collocations --- words that often appear consecutively --- within corpora. They may also be used to find other associations between word occurrences. See Manning and Schutze ch. 5 at http://nlp.stanford.edu/fsnlp/promo/colloc.pdf and the Text::NSP Perl package at http://ngram.sourceforge.net Finding collocations requires first calculating the frequencies of words and their appearance in the context of other words. Often the collection of words will then requiring filtering to only retain useful content terms. Each ngram of words may then be scored according to some association measure, in order to determine the relative likelihood of each ngram being a collocation. The ``BigramCollocationFinder`` and ``TrigramCollocationFinder`` classes provide these functionalities, dependent on being provided a function which scores a ngram given appropriate frequency counts. A number of standard association measures are provided in bigram_measures and trigram_measures. 
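A minimal usage sketch (assumes the ``genesis`` corpus has been downloaded;
output omitted here):

    >>> from nltk.collocations import BigramCollocationFinder
    >>> from nltk.metrics import BigramAssocMeasures
    >>> from nltk.corpus import genesis
    >>> finder = BigramCollocationFinder.from_words(genesis.words('english-web.txt'))
    >>> finder.nbest(BigramAssocMeasures.likelihood_ratio, 5)    # doctest: +SKIP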
""" from __future__ import print_function # Possible TODOs: # - consider the distinction between f(x,_) and f(x) and whether our # approximation is good enough for fragmented data, and mention it # - add a n-gram collocation finder with measures which only utilise n-gram # and unigram counts (raw_freq, pmi, student_t) import itertools as _itertools from nltk.compat import iteritems from nltk.probability import FreqDist from nltk.util import ngrams from nltk.metrics import ContingencyMeasures, BigramAssocMeasures, TrigramAssocMeasures from nltk.metrics.spearman import ranks_from_scores, spearman_correlation class AbstractCollocationFinder(object): """ An abstract base class for collocation finders whose purpose is to collect collocation candidate frequencies, filter and rank them. As a minimum, collocation finders require the frequencies of each word in a corpus, and the joint frequency of word tuples. This data should be provided through nltk.probability.FreqDist objects or an identical interface. """ def __init__(self, word_fd, ngram_fd): self.word_fd = word_fd self.N = word_fd.N() self.ngram_fd = ngram_fd @classmethod def _build_new_documents(cls, documents, window_size, pad_left=False, pad_right=False, pad_symbol=None): ''' Pad the document with the place holder according to the window_size ''' padding = (pad_symbol,) * (window_size - 1) if pad_right: return _itertools.chain.from_iterable(_itertools.chain(doc, padding) for doc in documents) if pad_left: return _itertools.chain.from_iterable(_itertools.chain(padding, doc) for doc in documents) @classmethod def from_documents(cls, documents): """Constructs a collocation finder given a collection of documents, each of which is a list (or iterable) of tokens. """ #return cls.from_words(_itertools.chain(*documents)) return cls.from_words(cls._build_new_documents(documents, cls.default_ws, pad_right=True)) @staticmethod def _ngram_freqdist(words, n): return FreqDist(tuple(words[i:i + n]) for i in range(len(words) - 1)) def _apply_filter(self, fn=lambda ngram, freq: False): """Generic filter removes ngrams from the frequency distribution if the function returns True when passed an ngram tuple. """ tmp_ngram = FreqDist() for ngram, freq in iteritems(self.ngram_fd): if not fn(ngram, freq): tmp_ngram[ngram] = freq self.ngram_fd = tmp_ngram def apply_freq_filter(self, min_freq): """Removes candidate ngrams which have frequency less than min_freq.""" self._apply_filter(lambda ng, freq: freq < min_freq) def apply_ngram_filter(self, fn): """Removes candidate ngrams (w1, w2, ...) where fn(w1, w2, ...) evaluates to True. """ self._apply_filter(lambda ng, f: fn(*ng)) def apply_word_filter(self, fn): """Removes candidate ngrams (w1, w2, ...) where any of (fn(w1), fn(w2), ...) evaluates to True. """ self._apply_filter(lambda ng, f: any(fn(w) for w in ng)) def _score_ngrams(self, score_fn): """Generates of (ngram, score) pairs as determined by the scoring function provided. """ for tup in self.ngram_fd: score = self.score_ngram(score_fn, *tup) if score is not None: yield tup, score def score_ngrams(self, score_fn): """Returns a sequence of (ngram, score) pairs ordered from highest to lowest score, as determined by the scoring function provided. 
""" return sorted(self._score_ngrams(score_fn), key=lambda t: (-t[1], t[0])) def nbest(self, score_fn, n): """Returns the top n ngrams when scored by the given function.""" return [p for p, s in self.score_ngrams(score_fn)[:n]] def above_score(self, score_fn, min_score): """Returns a sequence of ngrams, ordered by decreasing score, whose scores each exceed the given minimum score. """ for ngram, score in self.score_ngrams(score_fn): if score > min_score: yield ngram else: break class BigramCollocationFinder(AbstractCollocationFinder): """A tool for the finding and ranking of bigram collocations or other association measures. It is often useful to use from_words() rather than constructing an instance directly. """ default_ws = 2 def __init__(self, word_fd, bigram_fd, window_size=2): """Construct a BigramCollocationFinder, given FreqDists for appearances of words and (possibly non-contiguous) bigrams. """ AbstractCollocationFinder.__init__(self, word_fd, bigram_fd) self.window_size = window_size @classmethod def from_words(cls, words, window_size=2): """Construct a BigramCollocationFinder for all bigrams in the given sequence. When window_size > 2, count non-contiguous bigrams, in the style of Church and Hanks's (1990) association ratio. """ wfd = FreqDist() bfd = FreqDist() if window_size < 2: raise ValueError("Specify window_size at least 2") for window in ngrams(words, window_size, pad_right=True): w1 = window[0] if w1 is None: continue wfd[w1] += 1 for w2 in window[1:]: if w2 is not None: bfd[(w1, w2)] += 1 return cls(wfd, bfd, window_size=window_size) def score_ngram(self, score_fn, w1, w2): """Returns the score for a given bigram using the given scoring function. Following Church and Hanks (1990), counts are scaled by a factor of 1/(window_size - 1). """ n_all = self.N n_ii = self.ngram_fd[(w1, w2)] / (self.window_size - 1.0) if not n_ii: return n_ix = self.word_fd[w1] n_xi = self.word_fd[w2] return score_fn(n_ii, (n_ix, n_xi), n_all) class TrigramCollocationFinder(AbstractCollocationFinder): """A tool for the finding and ranking of trigram collocations or other association measures. It is often useful to use from_words() rather than constructing an instance directly. """ default_ws = 3 def __init__(self, word_fd, bigram_fd, wildcard_fd, trigram_fd): """Construct a TrigramCollocationFinder, given FreqDists for appearances of words, bigrams, two words with any word between them, and trigrams. """ AbstractCollocationFinder.__init__(self, word_fd, trigram_fd) self.wildcard_fd = wildcard_fd self.bigram_fd = bigram_fd @classmethod def from_words(cls, words, window_size=3): """Construct a TrigramCollocationFinder for all trigrams in the given sequence. """ if window_size < 3: raise ValueError("Specify window_size at least 3") wfd = FreqDist() wildfd = FreqDist() bfd = FreqDist() tfd = FreqDist() for window in ngrams(words, window_size, pad_right=True): w1 = window[0] if w1 is None: continue for w2, w3 in _itertools.combinations(window[1:], 2): wfd[w1] += 1 if w2 is None: continue bfd[(w1, w2)] += 1 if w3 is None: continue wildfd[(w1, w3)] += 1 tfd[(w1, w2, w3)] += 1 return cls(wfd, bfd, wildfd, tfd) def bigram_finder(self): """Constructs a bigram collocation finder with the bigram and unigram data from this finder. Note that this does not include any filtering applied to this finder. """ return BigramCollocationFinder(self.word_fd, self.bigram_fd) def score_ngram(self, score_fn, w1, w2, w3): """Returns the score for a given trigram using the given scoring function. 
""" n_all = self.N n_iii = self.ngram_fd[(w1, w2, w3)] if not n_iii: return n_iix = self.bigram_fd[(w1, w2)] n_ixi = self.wildcard_fd[(w1, w3)] n_xii = self.bigram_fd[(w2, w3)] n_ixx = self.word_fd[w1] n_xix = self.word_fd[w2] n_xxi = self.word_fd[w3] return score_fn(n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_all) class QuadgramCollocationFinder(AbstractCollocationFinder): """A tool for the finding and ranking of quadgram collocations or other association measures. It is often useful to use from_words() rather than constructing an instance directly. """ default_ws = 4 def __init__(self, word_fd, quadgram_fd, ii, iii, ixi, ixxi, iixi, ixii): """Construct a QuadgramCollocationFinder, given FreqDists for appearances of words, bigrams, trigrams, two words with one word and two words between them, three words with a word between them in both variations. """ AbstractCollocationFinder.__init__(self, word_fd, quadgram_fd) self.iii = iii self.ii = ii self.ixi = ixi self.ixxi = ixxi self.iixi = iixi self.ixii = ixii @classmethod def from_words(cls, words, window_size=4): if window_size < 4: raise ValueError("Specify window_size at least 4") ixxx = FreqDist() iiii = FreqDist() ii = FreqDist() iii = FreqDist() ixi = FreqDist() ixxi = FreqDist() iixi = FreqDist() ixii = FreqDist() for window in ngrams(words, window_size, pad_right=True): w1 = window[0] if w1 is None: continue for w2, w3, w4 in _itertools.combinations(window[1:], 3): ixxx[w1] += 1 if w2 is None: continue ii[(w1, w2)] += 1 if w3 is None: continue iii[(w1, w2, w3)] += 1 ixi[(w1, w3)] += 1 if w4 is None: continue iiii[(w1, w2, w3, w4)] += 1 ixxi[(w1, w4)] += 1 ixii[(w1, w3, w4)] += 1 iixi[(w1, w2, w4)] += 1 return cls(ixxx, iiii, ii, iii, ixi, ixxi, iixi, ixii) def score_ngram(self, score_fn, w1, w2, w3, w4): n_all = self.N n_iiii = self.ngram_fd[(w1, w2, w3, w4)] if not n_iiii: return n_iiix = self.iii[(w1, w2, w3)] n_xiii = self.iii[(w2, w3, w4)] n_iixi = self.iixi[(w1, w2, w4)] n_ixii = self.ixii[(w1, w3, w4)] n_iixx = self.ii[(w1, w2)] n_xxii = self.ii[(w3, w4)] n_xiix = self.ii[(w2, w3)] n_ixix = self.ixi[(w1, w3)] n_ixxi = self.ixxi[(w1, w4)] n_xixi = self.ixi[(w2, w4)] n_ixxx = self.word_fd[w1] n_xixx = self.word_fd[w2] n_xxix = self.word_fd[w3] n_xxxi = self.word_fd[w4] return score_fn(n_iiii, (n_iiix, n_iixi, n_ixii, n_xiii), (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix), (n_ixxx, n_xixx, n_xxix, n_xxxi), n_all) def demo(scorer=None, compare_scorer=None): """Finds bigram collocations in the files of the WebText corpus.""" from nltk.metrics import BigramAssocMeasures, spearman_correlation, ranks_from_scores if scorer is None: scorer = BigramAssocMeasures.likelihood_ratio if compare_scorer is None: compare_scorer = BigramAssocMeasures.raw_freq from nltk.corpus import stopwords, webtext ignored_words = stopwords.words('english') word_filter = lambda w: len(w) < 3 or w.lower() in ignored_words for file in webtext.fileids(): words = [word.lower() for word in webtext.words(file)] cf = BigramCollocationFinder.from_words(words) cf.apply_freq_filter(3) cf.apply_word_filter(word_filter) corr = spearman_correlation(ranks_from_scores(cf.score_ngrams(scorer)), ranks_from_scores(cf.score_ngrams(compare_scorer))) print(file) print('\t', [' '.join(tup) for tup in cf.nbest(scorer, 15)]) print('\t Correlation to %s: %0.4f' % (compare_scorer.__name__, corr)) # Slows down loading too much # bigram_measures = BigramAssocMeasures() # trigram_measures = TrigramAssocMeasures() if __name__ == '__main__': import sys from nltk.metrics import 
BigramAssocMeasures try: scorer = eval('BigramAssocMeasures.' + sys.argv[1]) except IndexError: scorer = None try: compare_scorer = eval('BigramAssocMeasures.' + sys.argv[2]) except IndexError: compare_scorer = None demo(scorer, compare_scorer) __all__ = ['BigramCollocationFinder', 'TrigramCollocationFinder', 'QuadgramCollocationFinder'] nltk-3.1/nltk/compat.py0000755000076500000240000005601512607224144014675 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Compatibility # # Copyright (C) 2001-2015 NLTK Project # # URL: # For license information, see LICENSE.TXT from __future__ import absolute_import, print_function import sys import types from functools import wraps # Python 2/3 compatibility layer. Based on six. PY3 = sys.version_info[0] == 3 PY26 = sys.version_info[:2] == (2, 6) if PY3: def b(s): return s.encode("latin-1") def u(s): return s string_types = str, integer_types = int, class_types = type, text_type = str binary_type = bytes MAXSIZE = sys.maxsize get_im_class = lambda meth: meth.__self__.__class__ xrange = range _iterkeys = "keys" _itervalues = "values" _iteritems = "items" from imp import reload raw_input = input imap = map izip = zip import io StringIO = io.StringIO BytesIO = io.BytesIO import html.entities as htmlentitydefs from urllib.request import (urlopen, ProxyHandler, build_opener, install_opener, getproxies, HTTPPasswordMgrWithDefaultRealm, ProxyBasicAuthHandler, ProxyDigestAuthHandler, Request, url2pathname) from urllib.error import HTTPError, URLError from urllib.parse import quote_plus, unquote_plus, urlencode from collections import Counter from datetime import timezone UTC = timezone.utc from tempfile import TemporaryDirectory unichr = chr if sys.version_info[1] <= 1: def int2byte(i): return bytes((i,)) else: # This is about 2x faster than the implementation above on 3.2+ import operator int2byte = operator.methodcaller("to_bytes", 1, "big") else: def b(s): return s def u(s): return unicode(s, "unicode_escape") string_types = basestring, integer_types = (int, long) class_types = (type, types.ClassType) text_type = unicode binary_type = str get_im_class = lambda meth: meth.im_class xrange = xrange _iterkeys = "iterkeys" _itervalues = "itervalues" _iteritems = "iteritems" reload = reload raw_input = raw_input from itertools import imap, izip try: from cStringIO import StringIO except ImportError: from StringIO import StringIO BytesIO = StringIO import htmlentitydefs from urllib2 import (urlopen, HTTPError, URLError, ProxyHandler, build_opener, install_opener, HTTPPasswordMgrWithDefaultRealm, ProxyBasicAuthHandler, ProxyDigestAuthHandler, Request) from urllib import getproxies, quote_plus, unquote_plus, urlencode, url2pathname # Maps py2 tkinter package structure to py3 using import hook (PEP 302) class TkinterPackage(object): def __init__(self): self.mod = __import__("Tkinter") self.__path__ = ["nltk_py2_tkinter_package_path"] def __getattr__(self, name): return getattr(self.mod, name) class TkinterLoader(object): def __init__(self): # module name mapping from py3 to py2 self.module_map = { "tkinter": "Tkinter", "tkinter.filedialog": "tkFileDialog", "tkinter.font": "tkFont", "tkinter.messagebox": "tkMessageBox", } def find_module(self, name, path=None): # we are only interested in tkinter modules listed # in self.module_map if name in self.module_map: return self def load_module(self, name): if name not in sys.modules: if name == 'tkinter': mod = TkinterPackage() else: mod = __import__(self.module_map[name]) sys.modules[name] = mod 
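            # PEP 302 requires load_module() to return the module object,
            # whether it was just created above or already present in
            # sys.modules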
return sys.modules[name] sys.meta_path.insert(0, TkinterLoader()) from datetime import tzinfo, timedelta ZERO = timedelta(0) HOUR = timedelta(hours=1) # A UTC class for python 2.7 class UTC(tzinfo): """UTC""" def utcoffset(self, dt): return ZERO def tzname(self, dt): return "UTC" def dst(self, dt): return ZERO UTC = UTC() unichr = unichr int2byte = chr import csv import codecs import cStringIO class UnicodeWriter: """ A CSV writer which will write rows to CSV file "f", which is encoded in the given encoding. see https://docs.python.org/2/library/csv.html """ def __init__(self, f, dialect=csv.excel, encoding="utf-8", errors='replace', **kwds): # Redirect output to a queue self.queue = cStringIO.StringIO() self.writer = csv.writer(self.queue, dialect=dialect, **kwds) self.stream = f encoder_cls = codecs.getincrementalencoder(encoding) self.encoder = encoder_cls(errors=errors) def encode(self, data): if isinstance(data, basestring): return data.encode("utf-8") else: return data def writerow(self, row): self.writer.writerow([self.encode(s) for s in row]) # Fetch UTF-8 output from the queue ... data = self.queue.getvalue() data = data.decode("utf-8") # ... and reencode it into the target encoding data = self.encoder.encode(data, 'replace') # write to the target stream self.stream.write(data) # empty queue self.queue.truncate(0) import warnings as _warnings import os as _os from tempfile import mkdtemp class TemporaryDirectory(object): """Create and return a temporary directory. This has the same behavior as mkdtemp but can be used as a context manager. For example: with TemporaryDirectory() as tmpdir: ... Upon exiting the context, the directory and everything contained in it are removed. http://stackoverflow.com/questions/19296146/tempfile-temporarydirectory-context-manager-in-python-2-7 """ def __init__(self, suffix="", prefix="tmp", dir=None): self._closed = False self.name = None # Handle mkdtemp raising an exception self.name = mkdtemp(suffix, prefix, dir) def __repr__(self): return "<{} {!r}>".format(self.__class__.__name__, self.name) def __enter__(self): return self.name def cleanup(self, _warn=False): if self.name and not self._closed: try: self._rmtree(self.name) except (TypeError, AttributeError) as ex: # Issue #10188: Emit a warning on stderr # if the directory could not be cleaned # up due to missing globals if "None" not in str(ex): raise print("ERROR: {!r} while cleaning up {!r}".format(ex, self,), file=sys.stderr) return self._closed = True if _warn: self._warn("Implicitly cleaning up {!r}".format(self), ResourceWarning) def __exit__(self, exc, value, tb): self.cleanup() def __del__(self): # Issue a ResourceWarning if implicit cleanup needed self.cleanup(_warn=True) # XXX (ncoghlan): The following code attempts to make # this class tolerant of the module nulling out process # that happens during CPython interpreter shutdown # Alas, it doesn't actually manage it. See issue #10188 _listdir = staticmethod(_os.listdir) _path_join = staticmethod(_os.path.join) _isdir = staticmethod(_os.path.isdir) _islink = staticmethod(_os.path.islink) _remove = staticmethod(_os.remove) _rmdir = staticmethod(_os.rmdir) _warn = _warnings.warn def _rmtree(self, path): # Essentially a stripped down version of shutil.rmtree. We can't # use globals because they may be None'ed out at shutdown. 
for name in self._listdir(path): fullname = self._path_join(path, name) try: isdir = self._isdir(fullname) and not self._islink(fullname) except OSError: isdir = False if isdir: self._rmtree(fullname) else: try: self._remove(fullname) except OSError: pass try: self._rmdir(path) except OSError: pass if PY26: from operator import itemgetter from heapq import nlargest from itertools import repeat, ifilter class Counter(dict): '''Dict subclass for counting hashable objects. Sometimes called a bag or multiset. Elements are stored as dictionary keys and their counts are stored as dictionary values. >>> Counter('zyzygy') Counter({'y': 3, 'z': 2, 'g': 1}) ''' def __init__(self, iterable=None, **kwds): '''Create a new, empty Counter object. And if given, count elements from an input iterable. Or, initialize the count from another mapping of elements to their counts. >>> Counter() # a new, empty counter >>> Counter('gallahad') # a new counter from an iterable >>> Counter({'a': 4, 'b': 2}) # a new counter from a mapping >>> Counter(a=4, b=2) # a new counter from keyword args ''' self.update(iterable, **kwds) def __missing__(self, key): return 0 def most_common(self, n=None): '''List the n most common elements and their counts from the most common to the least. If n is None, then list all element counts. >>> Counter('abracadabra').most_common(3) [('a', 5), ('r', 2), ('b', 2)] ''' if n is None: return sorted(self.iteritems(), key=itemgetter(1), reverse=True) return nlargest(n, self.iteritems(), key=itemgetter(1)) def elements(self): '''Iterator over elements repeating each as many times as its count. >>> c = Counter('ABCABC') >>> sorted(c.elements()) ['A', 'A', 'B', 'B', 'C', 'C'] If an element's count has been set to zero or is a negative number, elements() will ignore it. ''' for elem, count in self.iteritems(): for _ in repeat(None, count): yield elem # Override dict methods where the meaning changes for Counter # objects. @classmethod def fromkeys(cls, iterable, v=None): raise NotImplementedError( 'Counter.fromkeys() is undefined. Use Counter(iterable) instead.') def update(self, iterable=None, **kwds): '''Like dict.update() but add counts instead of replacing them. Source can be an iterable, a dictionary, or another Counter instance. >>> c = Counter('which') >>> c.update('witch') # add elements from another iterable >>> d = Counter('watch') >>> c.update(d) # add elements from another counter >>> c['h'] # four 'h' in which, witch, and watch 4 ''' if iterable is not None: if hasattr(iterable, 'iteritems'): if self: self_get = self.get for elem, count in iterable.iteritems(): self[elem] = self_get(elem, 0) + count else: # fast path when counter is empty dict.update(self, iterable) else: self_get = self.get for elem in iterable: self[elem] = self_get(elem, 0) + 1 if kwds: self.update(kwds) def copy(self): 'Like dict.copy() but returns a Counter instance instead of a dict.' return Counter(self) def __delitem__(self, elem): 'Like dict.__delitem__() but does not raise KeyError for missing values.' if elem in self: dict.__delitem__(self, elem) def __repr__(self): if not self: return '%s()' % self.__class__.__name__ items = ', '.join(map('%r: %r'.__mod__, self.most_common())) return '%s({%s})' % (self.__class__.__name__, items) # Multiset-style mathematical operations discussed in: # Knuth TAOCP Volume II section 4.6.3 exercise 19 # and at http://en.wikipedia.org/wiki/Multiset # # Outputs guaranteed to only include positive counts. 
# # To strip negative and zero counts, add-in an empty counter: # c += Counter() def __add__(self, other): '''Add counts from two counters. >>> Counter('abbb') + Counter('bcc') Counter({'b': 4, 'c': 2, 'a': 1}) ''' if not isinstance(other, Counter): return NotImplemented result = Counter() for elem in set(self) | set(other): newcount = self[elem] + other[elem] if newcount > 0: result[elem] = newcount return result def __sub__(self, other): ''' Subtract count, but keep only results with positive counts. >>> Counter('abbbc') - Counter('bccd') Counter({'b': 2, 'a': 1}) ''' if not isinstance(other, Counter): return NotImplemented result = Counter() for elem in set(self) | set(other): newcount = self[elem] - other[elem] if newcount > 0: result[elem] = newcount return result def __or__(self, other): '''Union is the maximum of value in either of the input counters. >>> Counter('abbb') | Counter('bcc') Counter({'b': 3, 'c': 2, 'a': 1}) ''' if not isinstance(other, Counter): return NotImplemented _max = max result = Counter() for elem in set(self) | set(other): newcount = _max(self[elem], other[elem]) if newcount > 0: result[elem] = newcount return result def __and__(self, other): ''' Intersection is the minimum of corresponding counts. >>> Counter('abbb') & Counter('bcc') Counter({'b': 1}) ''' if not isinstance(other, Counter): return NotImplemented _min = min result = Counter() if len(self) < len(other): self, other = other, self for elem in ifilter(self.__contains__, other): newcount = _min(self[elem], other[elem]) if newcount > 0: result[elem] = newcount return result else: from collections import Counter def iterkeys(d): """Return an iterator over the keys of a dictionary.""" return getattr(d, _iterkeys)() def itervalues(d): """Return an iterator over the values of a dictionary.""" return getattr(d, _itervalues)() def iteritems(d): """Return an iterator over the (key, value) pairs of a dictionary.""" return getattr(d, _iteritems)() try: from functools import total_ordering except ImportError: # python 2.6 def total_ordering(cls): """Class decorator that fills in missing ordering methods""" convert = { '__lt__': [('__gt__', lambda self, other: not (self < other or self == other)), ('__le__', lambda self, other: self < other or self == other), ('__ge__', lambda self, other: not self < other)], '__le__': [('__ge__', lambda self, other: not self <= other or self == other), ('__lt__', lambda self, other: self <= other and not self == other), ('__gt__', lambda self, other: not self <= other)], '__gt__': [('__lt__', lambda self, other: not (self > other or self == other)), ('__ge__', lambda self, other: self > other or self == other), ('__le__', lambda self, other: not self > other)], '__ge__': [('__le__', lambda self, other: (not self >= other) or self == other), ('__gt__', lambda self, other: self >= other and not self == other), ('__lt__', lambda self, other: not self >= other)] } roots = set(dir(cls)) & set(convert) if not roots: raise ValueError( 'must define at least one ordering operation: < > <= >=') root = max(roots) # prefer __lt__ to __le__ to __gt__ to __ge__ for opname, opfunc in convert[root]: if opname not in roots: opfunc.__name__ = opname opfunc.__doc__ = getattr(int, opname).__doc__ setattr(cls, opname, opfunc) return cls # ======= Compatibility for datasets that care about Python versions ======== # The following datasets have a /PY3 subdirectory containing # a full copy of the data which has been re-encoded or repickled. 
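# add_py3_data() below inserts a "/PY3" component after any of these prefixes
# (e.g. "tokenizers/punkt/english.pickle" becomes
# "tokenizers/punkt/PY3/english.pickle") so that Python 3 picks up the
# re-pickled copies.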
_PY3_DATA_UPDATES = [] if sys.platform.startswith('win'): _PY3_DATA_UPDATES = ["chunkers\maxent_ne_chunker", "help\tagsets", "taggers\maxent_treebank_pos_tagger", "tokenizers\punkt"] else: _PY3_DATA_UPDATES = ["chunkers/maxent_ne_chunker", "help/tagsets", "taggers/maxent_treebank_pos_tagger", "tokenizers/punkt"] def add_py3_data(path): if PY3: for item in _PY3_DATA_UPDATES: if item in str(path) and "/PY3" not in str(path): pos = path.index(item) + len(item) if path[pos:pos + 4] == ".zip": pos += 4 path = path[:pos] + "/PY3" + path[pos:] break return path # for use in adding /PY3 to the second (filename) argument # of the file pointers in data.py def py3_data(init_func): def _decorator(*args, **kwargs): args = (args[0], add_py3_data(args[1])) + args[2:] return init_func(*args, **kwargs) return wraps(init_func)(_decorator) # ======= Compatibility layer for __str__ and __repr__ ========== import unicodedata import functools def remove_accents(text): if isinstance(text, bytes): text = text.decode('ascii') category = unicodedata.category # this gives a small (~10%) speedup return ''.join( c for c in unicodedata.normalize('NFKD', text) if category(c) != 'Mn' ) # Select the best transliteration method: try: # Older versions of Unidecode are licensed under Artistic License; # assume an older version is installed. from unidecode import unidecode as transliterate except ImportError: try: # text-unidecode implementation is worse than Unidecode # implementation so Unidecode is preferred. from text_unidecode import unidecode as transliterate except ImportError: # This transliteration method should be enough # for many Western languages. transliterate = remove_accents def python_2_unicode_compatible(klass): """ This decorator defines __unicode__ method and fixes __repr__ and __str__ methods under Python 2. To support Python 2 and 3 with a single code base, define __str__ and __repr__ methods returning unicode text and apply this decorator to the class. Original __repr__ and __str__ would be available as unicode_repr and __unicode__ (under both Python 2 and Python 3). """ if not issubclass(klass, object): raise ValueError("This decorator doesn't work for old-style classes") # both __unicode__ and unicode_repr are public because they # may be useful in console under Python 2.x # if __str__ or __repr__ are not overriden in a subclass, # they may be already fixed by this decorator in a parent class # and we shouldn't them again if not _was_fixed(klass.__str__): klass.__unicode__ = klass.__str__ if not PY3: klass.__str__ = _7bit(_transliterated(klass.__unicode__)) if not _was_fixed(klass.__repr__): klass.unicode_repr = klass.__repr__ if not PY3: klass.__repr__ = _7bit(klass.unicode_repr) return klass def unicode_repr(obj): """ For classes that was fixed with @python_2_unicode_compatible ``unicode_repr`` returns ``obj.unicode_repr()``; for unicode strings the result is returned without "u" letter (to make output the same under Python 2.x and Python 3.x); for other variables it is the same as ``repr``. 
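    For example, ``unicode_repr(u'abc')`` gives ``"'abc'"`` under both Python 2.x
    and Python 3.x, whereas plain ``repr`` would give ``"u'abc'"`` under Python 2.x.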
""" if PY3: return repr(obj) # Python 2.x if hasattr(obj, 'unicode_repr'): return obj.unicode_repr() if isinstance(obj, unicode): return repr(obj)[1:] # strip "u" letter from output return repr(obj) def _transliterated(method): def wrapper(self): return transliterate(method(self)) functools.update_wrapper(wrapper, method, ["__name__", "__doc__"]) if hasattr(method, "_nltk_compat_7bit"): wrapper._nltk_compat_7bit = method._nltk_compat_7bit wrapper._nltk_compat_transliterated = True return wrapper def _7bit(method): def wrapper(self): return method(self).encode('ascii', 'backslashreplace') functools.update_wrapper(wrapper, method, ["__name__", "__doc__"]) if hasattr(method, "_nltk_compat_transliterated"): wrapper._nltk_compat_transliterated = method._nltk_compat_transliterated wrapper._nltk_compat_7bit = True return wrapper def _was_fixed(method): return (getattr(method, "_nltk_compat_7bit", False) or getattr(method, "_nltk_compat_transliterated", False)) nltk-3.1/nltk/corpus/0000755000076500000240000000000012610001541014325 5ustar sbstaff00000000000000nltk-3.1/nltk/corpus/__init__.py0000644000076500000240000003156512607224144016464 0ustar sbstaff00000000000000# Natural Language Toolkit: Corpus Readers # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT # TODO this docstring isn't up-to-date! """ NLTK corpus readers. The modules in this package provide functions that can be used to read corpus files in a variety of formats. These functions can be used to read both the corpus files that are distributed in the NLTK corpus package, and corpus files that are part of external corpora. Available Corpora ================= Please see http://www.nltk.org/nltk_data/ for a complete list. Install corpora using nltk.download(). Corpus Reader Functions ======================= Each corpus module defines one or more "corpus reader functions", which can be used to read documents from that corpus. These functions take an argument, ``item``, which is used to indicate which document should be read from the corpus: - If ``item`` is one of the unique identifiers listed in the corpus module's ``items`` variable, then the corresponding document will be loaded from the NLTK corpus package. - If ``item`` is a filename, then that file will be read. Additionally, corpus reader functions can be given lists of item names; in which case, they will return a concatenation of the corresponding documents. Corpus reader functions are named based on the type of information they return. Some common examples, and their return types, are: - words(): list of str - sents(): list of (list of str) - paras(): list of (list of (list of str)) - tagged_words(): list of (str,str) tuple - tagged_sents(): list of (list of (str,str)) - tagged_paras(): list of (list of (list of (str,str))) - chunked_sents(): list of (Tree w/ (str,str) leaves) - parsed_sents(): list of (Tree with str leaves) - parsed_paras(): list of (list of (Tree with str leaves)) - xml(): A single xml ElementTree - raw(): unprocessed corpus contents For example, to read a list of the words in the Brown Corpus, use ``nltk.corpus.brown.words()``: >>> from nltk.corpus import brown >>> print(", ".join(brown.words())) The, Fulton, County, Grand, Jury, said, ... 
""" import re from nltk.tokenize import RegexpTokenizer from nltk.corpus.util import LazyCorpusLoader from nltk.corpus.reader import * abc = LazyCorpusLoader( 'abc', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding=[ ('science', 'latin_1'), ('rural', 'utf8')]) alpino = LazyCorpusLoader( 'alpino', AlpinoCorpusReader, tagset='alpino') brown = LazyCorpusLoader( 'brown', CategorizedTaggedCorpusReader, r'c[a-z]\d\d', cat_file='cats.txt', tagset='brown', encoding="ascii") cess_cat = LazyCorpusLoader( 'cess_cat', BracketParseCorpusReader, r'(?!\.).*\.tbf', tagset='unknown', encoding='ISO-8859-15') cess_esp = LazyCorpusLoader( 'cess_esp', BracketParseCorpusReader, r'(?!\.).*\.tbf', tagset='unknown', encoding='ISO-8859-15') cmudict = LazyCorpusLoader( 'cmudict', CMUDictCorpusReader, ['cmudict']) comtrans = LazyCorpusLoader( 'comtrans', AlignedCorpusReader, r'(?!\.).*\.txt') comparative_sentences = LazyCorpusLoader( 'comparative_sentences', ComparativeSentencesCorpusReader, r'labeledSentences\.txt', encoding='latin-1') conll2000 = LazyCorpusLoader( 'conll2000', ConllChunkCorpusReader, ['train.txt', 'test.txt'], ('NP','VP','PP'), tagset='wsj', encoding='ascii') conll2002 = LazyCorpusLoader( 'conll2002', ConllChunkCorpusReader, '.*\.(test|train).*', ('LOC', 'PER', 'ORG', 'MISC'), encoding='utf-8') conll2007 = LazyCorpusLoader( 'conll2007', DependencyCorpusReader, '.*\.(test|train).*', encoding=[ ('eus', 'ISO-8859-2'), ('esp', 'utf8')]) crubadan = LazyCorpusLoader( 'crubadan', CrubadanCorpusReader, '.*\.txt') dependency_treebank = LazyCorpusLoader( 'dependency_treebank', DependencyCorpusReader, '.*\.dp', encoding='ascii') floresta = LazyCorpusLoader( 'floresta', BracketParseCorpusReader, r'(?!\.).*\.ptb', '#', tagset='unknown', encoding='ISO-8859-15') framenet = LazyCorpusLoader( 'framenet_v15', FramenetCorpusReader, ['frRelation.xml','frameIndex.xml','fulltextIndex.xml','luIndex.xml','semTypes.xml']) gazetteers = LazyCorpusLoader( 'gazetteers', WordListCorpusReader, r'(?!LICENSE|\.).*\.txt', encoding='ISO-8859-2') genesis = LazyCorpusLoader( 'genesis', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding=[ ('finnish|french|german', 'latin_1'), ('swedish', 'cp865'), ('.*', 'utf_8')]) gutenberg = LazyCorpusLoader( 'gutenberg', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1') # corpus not available with NLTK; these lines caused help(nltk.corpus) to break #hebrew_treebank = LazyCorpusLoader( # 'hebrew_treebank', BracketParseCorpusReader, r'.*\.txt') ieer = LazyCorpusLoader( 'ieer', IEERCorpusReader, r'(?!README|\.).*') inaugural = LazyCorpusLoader( 'inaugural', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='latin1') # [XX] This should probably just use TaggedCorpusReader: indian = LazyCorpusLoader( 'indian', IndianCorpusReader, r'(?!\.).*\.pos', tagset='unknown', encoding='utf8') ipipan = LazyCorpusLoader( 'ipipan', IPIPANCorpusReader, r'(?!\.).*morph\.xml') jeita = LazyCorpusLoader( 'jeita', ChasenCorpusReader, r'.*\.chasen', encoding='utf-8') knbc = LazyCorpusLoader( 'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp') lin_thesaurus = LazyCorpusLoader( 'lin_thesaurus', LinThesaurusCorpusReader, r'.*\.lsp') mac_morpho = LazyCorpusLoader( 'mac_morpho', MacMorphoCorpusReader, r'(?!\.).*\.txt', tagset='unknown', encoding='latin-1') machado = LazyCorpusLoader( 'machado', PortugueseCategorizedPlaintextCorpusReader, r'(?!\.).*\.txt', cat_pattern=r'([a-z]*)/.*', encoding='latin-1') masc_tagged = LazyCorpusLoader( 'masc_tagged', CategorizedTaggedCorpusReader, r'(spoken|written)/.*\.txt', 
cat_file='categories.txt', tagset='wsj', encoding="utf-8", sep="_") movie_reviews = LazyCorpusLoader( 'movie_reviews', CategorizedPlaintextCorpusReader, r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*', encoding='ascii') multext_east = LazyCorpusLoader( 'mte_teip5', MTECorpusReader, r'(oana).*\.xml', encoding="utf-8") names = LazyCorpusLoader( 'names', WordListCorpusReader, r'(?!\.).*\.txt', encoding='ascii') nkjp = LazyCorpusLoader( 'nkjp', NKJPCorpusReader, r'', encoding='utf8') nps_chat = LazyCorpusLoader( 'nps_chat', NPSChatCorpusReader, r'(?!README|\.).*\.xml', tagset='wsj') opinion_lexicon = LazyCorpusLoader( 'opinion_lexicon', OpinionLexiconCorpusReader, r'(\w+)\-words\.txt', encoding='ISO-8859-2') pl196x = LazyCorpusLoader( 'pl196x', Pl196xCorpusReader, r'[a-z]-.*\.xml', cat_file='cats.txt', textid_file='textids.txt', encoding='utf8') ppattach = LazyCorpusLoader( 'ppattach', PPAttachmentCorpusReader, ['training', 'test', 'devset']) product_reviews_1 = LazyCorpusLoader( 'product_reviews_1', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8') product_reviews_2 = LazyCorpusLoader( 'product_reviews_2', ReviewsCorpusReader, r'^(?!Readme).*\.txt', encoding='utf8') pros_cons = LazyCorpusLoader( 'pros_cons', ProsConsCorpusReader, r'Integrated(Cons|Pros)\.txt', cat_pattern=r'Integrated(Cons|Pros)\.txt', encoding='ISO-8859-2') ptb = LazyCorpusLoader( # Penn Treebank v3: WSJ and Brown portions 'ptb', CategorizedBracketParseCorpusReader, r'(WSJ/\d\d/WSJ_\d\d|BROWN/C[A-Z]/C[A-Z])\d\d.MRG', cat_file='allcats.txt', tagset='wsj') qc = LazyCorpusLoader( 'qc', StringCategoryCorpusReader, ['train.txt', 'test.txt'], encoding='ISO-8859-2') reuters = LazyCorpusLoader( 'reuters', CategorizedPlaintextCorpusReader, '(training|test).*', cat_file='cats.txt', encoding='ISO-8859-2') rte = LazyCorpusLoader( 'rte', RTECorpusReader, r'(?!\.).*\.xml') senseval = LazyCorpusLoader( 'senseval', SensevalCorpusReader, r'(?!\.).*\.pos') sentence_polarity = LazyCorpusLoader( 'sentence_polarity', CategorizedSentencesCorpusReader, r'rt-polarity\.(neg|pos)', cat_pattern=r'rt-polarity\.(neg|pos)', encoding='utf-8') sentiwordnet = LazyCorpusLoader( 'sentiwordnet', SentiWordNetCorpusReader, 'SentiWordNet_3.0.0.txt', encoding='utf-8') shakespeare = LazyCorpusLoader( 'shakespeare', XMLCorpusReader, r'(?!\.).*\.xml') sinica_treebank = LazyCorpusLoader( 'sinica_treebank', SinicaTreebankCorpusReader, ['parsed'], tagset='unknown', encoding='utf-8') state_union = LazyCorpusLoader( 'state_union', PlaintextCorpusReader, r'(?!\.).*\.txt', encoding='ISO-8859-2') stopwords = LazyCorpusLoader( 'stopwords', WordListCorpusReader, r'(?!README|\.).*', encoding='utf8') subjectivity = LazyCorpusLoader( 'subjectivity', CategorizedSentencesCorpusReader, r'(quote.tok.gt9|plot.tok.gt9)\.5000', cat_map={'quote.tok.gt9.5000':['subj'], 'plot.tok.gt9.5000':['obj']}, encoding='latin-1') swadesh = LazyCorpusLoader( 'swadesh', SwadeshCorpusReader, r'(?!README|\.).*', encoding='utf8') swadesh110 = LazyCorpusLoader( 'panlex_swadesh', SwadeshCorpusReader, r'swadesh110/.*\.txt', encoding='utf8') swadesh207 = LazyCorpusLoader( 'panlex_swadesh', SwadeshCorpusReader, r'swadesh207/.*\.txt', encoding='utf8') switchboard = LazyCorpusLoader( 'switchboard', SwitchboardCorpusReader, tagset='wsj') timit = LazyCorpusLoader( 'timit', TimitCorpusReader) timit_tagged = LazyCorpusLoader( 'timit', TimitTaggedCorpusReader, '.+\.tags', tagset='wsj', encoding='ascii') toolbox = LazyCorpusLoader( 'toolbox', ToolboxCorpusReader, r'(?!.*(README|\.)).*\.(dic|txt)') treebank = 
LazyCorpusLoader( 'treebank/combined', BracketParseCorpusReader, r'wsj_.*\.mrg', tagset='wsj', encoding='ascii') treebank_chunk = LazyCorpusLoader( 'treebank/tagged', ChunkedCorpusReader, r'wsj_.*\.pos', sent_tokenizer=RegexpTokenizer(r'(?<=/\.)\s*(?![^\[]*\])', gaps=True), para_block_reader=tagged_treebank_para_block_reader, tagset='wsj', encoding='ascii') treebank_raw = LazyCorpusLoader( 'treebank/raw', PlaintextCorpusReader, r'wsj_.*', encoding='ISO-8859-2') twitter_samples = LazyCorpusLoader( 'twitter_samples', TwitterCorpusReader, '.*\.json') udhr = LazyCorpusLoader( 'udhr', UdhrCorpusReader) udhr2 = LazyCorpusLoader( 'udhr2', PlaintextCorpusReader, r'.*\.txt', encoding='utf8') universal_treebanks = LazyCorpusLoader( 'universal_treebanks_v20', ConllCorpusReader, r'.*\.conll', columntypes = ('ignore', 'words', 'ignore', 'ignore', 'pos', 'ignore', 'ignore', 'ignore', 'ignore', 'ignore')) verbnet = LazyCorpusLoader( 'verbnet', VerbnetCorpusReader, r'(?!\.).*\.xml') webtext = LazyCorpusLoader( 'webtext', PlaintextCorpusReader, r'(?!README|\.).*\.txt', encoding='ISO-8859-2') wordnet = LazyCorpusLoader( 'wordnet', WordNetCorpusReader, LazyCorpusLoader('omw', CorpusReader, r'.*/wn-data-.*\.tab', encoding='utf8')) wordnet_ic = LazyCorpusLoader( 'wordnet_ic', WordNetICCorpusReader, '.*\.dat') words = LazyCorpusLoader( 'words', WordListCorpusReader, r'(?!README|\.).*', encoding='ascii') ycoe = LazyCorpusLoader( 'ycoe', YCOECorpusReader) # defined after treebank propbank = LazyCorpusLoader( 'propbank', PropbankCorpusReader, 'prop.txt', 'frames/.*\.xml', 'verbs.txt', lambda filename: re.sub(r'^wsj/\d\d/', '', filename), treebank) # Must be defined *after* treebank corpus. nombank = LazyCorpusLoader( 'nombank.1.0', NombankCorpusReader, 'nombank.1.0', 'frames/.*\.xml', 'nombank.1.0.words', lambda filename: re.sub(r'^wsj/\d\d/', '', filename), treebank) # Must be defined *after* treebank corpus. propbank_ptb = LazyCorpusLoader( 'propbank', PropbankCorpusReader, 'prop.txt', 'frames/.*\.xml', 'verbs.txt', lambda filename: filename.upper(), ptb) # Must be defined *after* ptb corpus. nombank_ptb = LazyCorpusLoader( 'nombank.1.0', NombankCorpusReader, 'nombank.1.0', 'frames/.*\.xml', 'nombank.1.0.words', lambda filename: filename.upper(), ptb) # Must be defined *after* ptb corpus. semcor = LazyCorpusLoader( 'semcor', SemcorCorpusReader, r'brown./tagfiles/br-.*\.xml', wordnet) # Must be defined *after* wordnet corpus. 
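# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): every name defined
# above is a LazyCorpusLoader proxy, so no corpus data is located or read at
# import time; the real reader is only constructed on first attribute access.
# That is also why propbank, nombank and semcor must be defined after the
# treebank, ptb and wordnet loaders they receive as constructor arguments.
# The helper below is a hypothetical usage example, assuming the relevant
# corpora have already been fetched with nltk.download().

def _lazy_loader_example():
    """A minimal usage sketch for the lazily loaded corpora defined above.

    >>> _lazy_loader_example()                        # doctest: +SKIP
    ['The', 'Fulton', 'County', 'Grand', 'Jury']
    """
    from nltk.corpus import brown, treebank
    # The first attribute access makes LazyCorpusLoader locate the data and
    # swap in the real corpus reader behind the module-level name.
    parse = treebank.parsed_sents()[0]    # a parsed nltk.Tree from the sample
    words = list(brown.words()[:5])
    return words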
def demo(): # This is out-of-date: abc.demo() brown.demo() # chat80.demo() cmudict.demo() conll2000.demo() conll2002.demo() genesis.demo() gutenberg.demo() ieer.demo() inaugural.demo() indian.demo() names.demo() ppattach.demo() senseval.demo() shakespeare.demo() sinica_treebank.demo() state_union.demo() stopwords.demo() timit.demo() toolbox.demo() treebank.demo() udhr.demo() webtext.demo() words.demo() # ycoe.demo() if __name__ == '__main__': #demo() pass # ** this is for nose ** # unload all corpus after tests def teardown_module(module=None): import nltk.corpus for name in dir(nltk.corpus): obj = getattr(nltk.corpus, name, None) if isinstance(obj, CorpusReader) and hasattr(obj, '_unload'): obj._unload() nltk-3.1/nltk/corpus/europarl_raw.py0000644000076500000240000000305512607224144017420 0ustar sbstaff00000000000000# Natural Language Toolkit: Europarl Corpus Readers # # Copyright (C) 2001-2015 NLTK Project # Author: Nitin Madnani # URL: # For license information, see LICENSE.TXT import re from nltk.corpus.util import LazyCorpusLoader from nltk.corpus.reader import * # Create a new corpus reader instance for each European language danish = LazyCorpusLoader( 'europarl_raw/danish', EuroparlCorpusReader, r'ep-.*\.da', encoding='utf-8') dutch = LazyCorpusLoader( 'europarl_raw/dutch', EuroparlCorpusReader, r'ep-.*\.nl', encoding='utf-8') english = LazyCorpusLoader( 'europarl_raw/english', EuroparlCorpusReader, r'ep-.*\.en', encoding='utf-8') finnish = LazyCorpusLoader( 'europarl_raw/finnish', EuroparlCorpusReader, r'ep-.*\.fi', encoding='utf-8') french = LazyCorpusLoader( 'europarl_raw/french', EuroparlCorpusReader, r'ep-.*\.fr', encoding='utf-8') german = LazyCorpusLoader( 'europarl_raw/german', EuroparlCorpusReader, r'ep-.*\.de', encoding='utf-8') greek = LazyCorpusLoader( 'europarl_raw/greek', EuroparlCorpusReader, r'ep-.*\.el', encoding='utf-8') italian = LazyCorpusLoader( 'europarl_raw/italian', EuroparlCorpusReader, r'ep-.*\.it', encoding='utf-8') portuguese = LazyCorpusLoader( 'europarl_raw/portuguese', EuroparlCorpusReader, r'ep-.*\.pt', encoding='utf-8') spanish = LazyCorpusLoader( 'europarl_raw/spanish', EuroparlCorpusReader, r'ep-.*\.es', encoding='utf-8') swedish = LazyCorpusLoader( 'europarl_raw/swedish', EuroparlCorpusReader, r'ep-.*\.sv', encoding='utf-8') nltk-3.1/nltk/corpus/reader/0000755000076500000240000000000012610001541015567 5ustar sbstaff00000000000000nltk-3.1/nltk/corpus/reader/__init__.py0000644000076500000240000001356612607224144017727 0ustar sbstaff00000000000000# Natural Language Toolkit: Corpus Readers # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ NLTK corpus readers. The modules in this package provide functions that can be used to read corpus fileids in a variety of formats. These functions can be used to read both the corpus fileids that are distributed in the NLTK corpus package, and corpus fileids that are part of external corpora. Corpus Reader Functions ======================= Each corpus module defines one or more "corpus reader functions", which can be used to read documents from that corpus. These functions take an argument, ``item``, which is used to indicate which document should be read from the corpus: - If ``item`` is one of the unique identifiers listed in the corpus module's ``items`` variable, then the corresponding document will be loaded from the NLTK corpus package. - If ``item`` is a fileid, then that file will be read. 
Additionally, corpus reader functions can be given lists of item names; in which case, they will return a concatenation of the corresponding documents. Corpus reader functions are named based on the type of information they return. Some common examples, and their return types, are: - words(): list of str - sents(): list of (list of str) - paras(): list of (list of (list of str)) - tagged_words(): list of (str,str) tuple - tagged_sents(): list of (list of (str,str)) - tagged_paras(): list of (list of (list of (str,str))) - chunked_sents(): list of (Tree w/ (str,str) leaves) - parsed_sents(): list of (Tree with str leaves) - parsed_paras(): list of (list of (Tree with str leaves)) - xml(): A single xml ElementTree - raw(): unprocessed corpus contents For example, to read a list of the words in the Brown Corpus, use ``nltk.corpus.brown.words()``: >>> from nltk.corpus import brown >>> print(", ".join(brown.words())) The, Fulton, County, Grand, Jury, said, ... """ from nltk.corpus.reader.plaintext import * from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * from nltk.corpus.reader.tagged import * from nltk.corpus.reader.cmudict import * from nltk.corpus.reader.conll import * from nltk.corpus.reader.chunked import * from nltk.corpus.reader.wordlist import * from nltk.corpus.reader.xmldocs import * from nltk.corpus.reader.ppattach import * from nltk.corpus.reader.senseval import * from nltk.corpus.reader.ieer import * from nltk.corpus.reader.sinica_treebank import * from nltk.corpus.reader.bracket_parse import * from nltk.corpus.reader.indian import * from nltk.corpus.reader.toolbox import * from nltk.corpus.reader.timit import * from nltk.corpus.reader.ycoe import * from nltk.corpus.reader.rte import * from nltk.corpus.reader.string_category import * from nltk.corpus.reader.propbank import * from nltk.corpus.reader.verbnet import * from nltk.corpus.reader.bnc import * from nltk.corpus.reader.nps_chat import * from nltk.corpus.reader.wordnet import * from nltk.corpus.reader.switchboard import * from nltk.corpus.reader.dependency import * from nltk.corpus.reader.nombank import * from nltk.corpus.reader.ipipan import * from nltk.corpus.reader.pl196x import * from nltk.corpus.reader.knbc import * from nltk.corpus.reader.chasen import * from nltk.corpus.reader.childes import * from nltk.corpus.reader.aligned import * from nltk.corpus.reader.lin import * from nltk.corpus.reader.semcor import * from nltk.corpus.reader.framenet import * from nltk.corpus.reader.udhr import * from nltk.corpus.reader.bnc import * from nltk.corpus.reader.sentiwordnet import * from nltk.corpus.reader.twitter import * from nltk.corpus.reader.nkjp import * from nltk.corpus.reader.crubadan import * from nltk.corpus.reader.mte import * from nltk.corpus.reader.reviews import * from nltk.corpus.reader.opinion_lexicon import * from nltk.corpus.reader.pros_cons import * from nltk.corpus.reader.categorized_sents import * from nltk.corpus.reader.comparative_sents import * # Make sure that nltk.corpus.reader.bracket_parse gives the module, not # the function bracket_parse() defined in nltk.tree: from nltk.corpus.reader import bracket_parse __all__ = [ 'CorpusReader', 'CategorizedCorpusReader', 'PlaintextCorpusReader', 'find_corpus_fileids', 'TaggedCorpusReader', 'CMUDictCorpusReader', 'ConllChunkCorpusReader', 'WordListCorpusReader', 'PPAttachmentCorpusReader', 'SensevalCorpusReader', 'IEERCorpusReader', 'ChunkedCorpusReader', 'SinicaTreebankCorpusReader', 'BracketParseCorpusReader', 'IndianCorpusReader', 
'ToolboxCorpusReader', 'TimitCorpusReader', 'YCOECorpusReader', 'MacMorphoCorpusReader', 'SyntaxCorpusReader', 'AlpinoCorpusReader', 'RTECorpusReader', 'StringCategoryCorpusReader','EuroparlCorpusReader', 'CategorizedBracketParseCorpusReader', 'CategorizedTaggedCorpusReader', 'CategorizedPlaintextCorpusReader', 'PortugueseCategorizedPlaintextCorpusReader', 'tagged_treebank_para_block_reader', 'PropbankCorpusReader', 'VerbnetCorpusReader', 'BNCCorpusReader', 'ConllCorpusReader', 'XMLCorpusReader', 'NPSChatCorpusReader', 'SwadeshCorpusReader', 'WordNetCorpusReader', 'WordNetICCorpusReader', 'SwitchboardCorpusReader', 'DependencyCorpusReader', 'NombankCorpusReader', 'IPIPANCorpusReader', 'Pl196xCorpusReader', 'TEICorpusView', 'KNBCorpusReader', 'ChasenCorpusReader', 'CHILDESCorpusReader', 'AlignedCorpusReader', 'TimitTaggedCorpusReader', 'LinThesaurusCorpusReader', 'SemcorCorpusReader', 'FramenetCorpusReader', 'UdhrCorpusReader', 'BNCCorpusReader', 'SentiWordNetCorpusReader', 'SentiSynset', 'TwitterCorpusReader', 'NKJPCorpusReader', 'CrubadanCorpusReader', 'MTECorpusReader', 'ReviewsCorpusReader', 'OpinionLexiconCorpusReader', 'ProsConsCorpusReader', 'CategorizedSentencesCorpusReader', 'ComparativeSentencesCorpusReader' ] nltk-3.1/nltk/corpus/reader/aligned.py0000644000076500000240000001140612607224144017562 0ustar sbstaff00000000000000# Natural Language Toolkit: Aligned Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # URL: # Author: Steven Bird # For license information, see LICENSE.TXT from nltk import compat from nltk.tokenize import WhitespaceTokenizer, RegexpTokenizer from nltk.translate import AlignedSent, Alignment from nltk.corpus.reader.api import CorpusReader from nltk.corpus.reader.util import StreamBackedCorpusView, concat,\ read_alignedsent_block class AlignedCorpusReader(CorpusReader): """ Reader for corpora of word-aligned sentences. Tokens are assumed to be separated by whitespace. Sentences begin on separate lines. """ def __init__(self, root, fileids, sep='/', word_tokenizer=WhitespaceTokenizer(), sent_tokenizer=RegexpTokenizer('\n', gaps=True), alignedsent_block_reader=read_alignedsent_block, encoding='latin1'): """ Construct a new Aligned Corpus reader for a set of documents located at the given root directory. Example usage: >>> root = '/...path to corpus.../' >>> reader = AlignedCorpusReader(root, '.*', '.txt') # doctest: +SKIP :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. """ CorpusReader.__init__(self, root, fileids, encoding) self._sep = sep self._word_tokenizer = word_tokenizer self._sent_tokenizer = sent_tokenizer self._alignedsent_block_reader = alignedsent_block_reader def raw(self, fileids=None): """ :return: the given file(s) as a single string. :rtype: str """ if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def words(self, fileids=None): """ :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return concat([AlignedSentCorpusView(fileid, enc, False, False, self._word_tokenizer, self._sent_tokenizer, self._alignedsent_block_reader) for (fileid, enc) in self.abspaths(fileids, True)]) def sents(self, fileids=None): """ :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings. 
:rtype: list(list(str)) """ return concat([AlignedSentCorpusView(fileid, enc, False, True, self._word_tokenizer, self._sent_tokenizer, self._alignedsent_block_reader) for (fileid, enc) in self.abspaths(fileids, True)]) def aligned_sents(self, fileids=None): """ :return: the given file(s) as a list of AlignedSent objects. :rtype: list(AlignedSent) """ return concat([AlignedSentCorpusView(fileid, enc, True, True, self._word_tokenizer, self._sent_tokenizer, self._alignedsent_block_reader) for (fileid, enc) in self.abspaths(fileids, True)]) class AlignedSentCorpusView(StreamBackedCorpusView): """ A specialized corpus view for aligned sentences. ``AlignedSentCorpusView`` objects are typically created by ``AlignedCorpusReader`` (not directly by nltk users). """ def __init__(self, corpus_file, encoding, aligned, group_by_sent, word_tokenizer, sent_tokenizer, alignedsent_block_reader): self._aligned = aligned self._group_by_sent = group_by_sent self._word_tokenizer = word_tokenizer self._sent_tokenizer = sent_tokenizer self._alignedsent_block_reader = alignedsent_block_reader StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) def read_block(self, stream): block = [self._word_tokenizer.tokenize(sent_str) for alignedsent_str in self._alignedsent_block_reader(stream) for sent_str in self._sent_tokenizer.tokenize(alignedsent_str)] if self._aligned: block[2] = Alignment.fromstring(" ".join(block[2])) # kludge; we shouldn't have tokenized the alignment string block = [AlignedSent(*block)] elif self._group_by_sent: block = [block[0]] else: block = block[0] return block nltk-3.1/nltk/corpus/reader/api.py0000644000076500000240000004265412607224144016741 0ustar sbstaff00000000000000# Natural Language Toolkit: API for Corpus Readers # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ API for corpus readers. """ from __future__ import unicode_literals import os import re from collections import defaultdict from nltk import compat from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer from nltk.corpus.reader.util import * @compat.python_2_unicode_compatible class CorpusReader(object): """ A base class for "corpus reader" classes, each of which can be used to read a specific corpus format. Each individual corpus reader instance is used to read a specific corpus, consisting of one or more files under a common root directory. Each file is identified by its ``file identifier``, which is the relative path to the file from the root directory. A separate subclass is be defined for each corpus format. These subclasses define one or more methods that provide 'views' on the corpus contents, such as ``words()`` (for a list of words) and ``parsed_sents()`` (for a list of parsed sentences). Called with no arguments, these methods will return the contents of the entire corpus. For most corpora, these methods define one or more selection arguments, such as ``fileids`` or ``categories``, which can be used to select which portion of the corpus should be returned. """ def __init__(self, root, fileids, encoding='utf8', tagset=None): """ :type root: PathPointer or str :param root: A path pointer identifying the root directory for this corpus. If a string is specified, then it will be converted to a ``PathPointer`` automatically. :param fileids: A list of the files that make up this corpus. This list can either be specified explicitly, as a list of strings; or implicitly, as a regular expression over file paths. 
The absolute path for each file will be constructed by joining the reader's root to each file name. :param encoding: The default unicode encoding for the files that make up the corpus. The value of ``encoding`` can be any of the following: - A string: ``encoding`` is the encoding name for all files. - A dictionary: ``encoding[file_id]`` is the encoding name for the file whose identifier is ``file_id``. If ``file_id`` is not in ``encoding``, then the file contents will be processed using non-unicode byte strings. - A list: ``encoding`` should be a list of ``(regexp, encoding)`` tuples. The encoding for a file whose identifier is ``file_id`` will be the ``encoding`` value for the first tuple whose ``regexp`` matches the ``file_id``. If no tuple's ``regexp`` matches the ``file_id``, the file contents will be processed using non-unicode byte strings. - None: the file contents of all files will be processed using non-unicode byte strings. :param tagset: The name of the tagset used by this corpus, to be used for normalizing or converting the POS tags returned by the tagged_...() methods. """ # Convert the root to a path pointer, if necessary. if isinstance(root, compat.string_types) and not isinstance(root, PathPointer): m = re.match('(.*\.zip)/?(.*)$|', root) zipfile, zipentry = m.groups() if zipfile: root = ZipFilePathPointer(zipfile, zipentry) else: root = FileSystemPathPointer(root) elif not isinstance(root, PathPointer): raise TypeError('CorpusReader: expected a string or a PathPointer') # If `fileids` is a regexp, then expand it. if isinstance(fileids, compat.string_types): fileids = find_corpus_fileids(root, fileids) self._fileids = fileids """A list of the relative paths for the fileids that make up this corpus.""" self._root = root """The root directory for this corpus.""" # If encoding was specified as a list of regexps, then convert # it to a dictionary. if isinstance(encoding, list): encoding_dict = {} for fileid in self._fileids: for x in encoding: (regexp, enc) = x if re.match(regexp, fileid): encoding_dict[fileid] = enc break encoding = encoding_dict self._encoding = encoding """The default unicode encoding for the fileids that make up this corpus. If ``encoding`` is None, then the file contents are processed using byte strings.""" self._tagset = tagset def __repr__(self): if isinstance(self._root, ZipFilePathPointer): path = '%s/%s' % (self._root.zipfile.filename, self._root.entry) else: path = '%s' % self._root.path return '<%s in %r>' % (self.__class__.__name__, path) def ensure_loaded(self): """ Load this corpus (if it has not already been loaded). This is used by LazyCorpusLoader as a simple method that can be used to make sure a corpus is loaded -- e.g., in case a user wants to do help(some_corpus). """ pass # no need to actually do anything. def readme(self): """ Return the contents of the corpus README file, if it exists. """ return self.open("README").read() def license(self): """ Return the contents of the corpus LICENSE file, if it exists. """ return self.open("LICENSE").read() def citation(self): """ Return the contents of the corpus citation.bib file, if it exists. """ return self.open("citation.bib").read() def fileids(self): """ Return a list of file identifiers for the fileids that make up this corpus. """ return self._fileids def abspath(self, fileid): """ Return the absolute path for the given file. :type fileid: str :param fileid: The file identifier for the file whose path should be returned. 
:rtype: PathPointer """ return self._root.join(fileid) def abspaths(self, fileids=None, include_encoding=False, include_fileid=False): """ Return a list of the absolute paths for all fileids in this corpus; or for the given list of fileids, if specified. :type fileids: None or str or list :param fileids: Specifies the set of fileids for which paths should be returned. Can be None, for all fileids; a list of file identifiers, for a specified set of fileids; or a single file identifier, for a single file. Note that the return value is always a list of paths, even if ``fileids`` is a single file identifier. :param include_encoding: If true, then return a list of ``(path_pointer, encoding)`` tuples. :rtype: list(PathPointer) """ if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] paths = [self._root.join(f) for f in fileids] if include_encoding and include_fileid: return list(zip(paths, [self.encoding(f) for f in fileids], fileids)) elif include_fileid: return list(zip(paths, fileids)) elif include_encoding: return list(zip(paths, [self.encoding(f) for f in fileids])) else: return paths def open(self, file): """ Return an open stream that can be used to read the given file. If the file's encoding is not None, then the stream will automatically decode the file's contents into unicode. :param file: The file identifier of the file to read. """ encoding = self.encoding(file) stream = self._root.join(file).open(encoding) return stream def encoding(self, file): """ Return the unicode encoding for the given corpus file, if known. If the encoding is unknown, or if the given file should be processed using byte strings (str), then return None. """ if isinstance(self._encoding, dict): return self._encoding.get(file) else: return self._encoding def _get_root(self): return self._root root = property(_get_root, doc=""" The directory where this corpus is stored. :type: PathPointer""") ###################################################################### #{ Corpora containing categorized items ###################################################################### class CategorizedCorpusReader(object): """ A mixin class used to aid in the implementation of corpus readers for categorized corpora. This class defines the method ``categories()``, which returns a list of the categories for the corpus or for a specified set of fileids; and overrides ``fileids()`` to take a ``categories`` argument, restricting the set of fileids to be returned. Subclasses are expected to: - Call ``__init__()`` to set up the mapping. - Override all view methods to accept a ``categories`` parameter, which can be used *instead* of the ``fileids`` parameter, to select which fileids should be included in the returned view. """ def __init__(self, kwargs): """ Initialize this mapping based on keyword arguments, as follows: - cat_pattern: A regular expression pattern used to find the category for each file identifier. The pattern will be applied to each file identifier, and the first matching group will be used as the category label for that file. - cat_map: A dictionary, mapping from file identifiers to category labels. - cat_file: The name of a file that contains the mapping from file identifiers to categories. The argument ``cat_delimiter`` can be used to specify a delimiter. The corresponding argument will be deleted from ``kwargs``. If more than one argument is specified, an exception will be raised. 
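        A minimal illustrative sketch (the root path below is hypothetical;
        the fileid and category patterns are borrowed from the movie_reviews
        corpus definition), using ``CategorizedPlaintextCorpusReader``, which
        mixes in this class:

            >>> from nltk.corpus.reader import CategorizedPlaintextCorpusReader
            >>> reader = CategorizedPlaintextCorpusReader(
            ...     '/path/to/movie_reviews', r'(?!\.).*\.txt',
            ...     cat_pattern=r'(neg|pos)/.*')                # doctest: +SKIP
            >>> reader.categories()                             # doctest: +SKIP
            ['neg', 'pos']

        Passing ``cat_map`` (a dictionary) or ``cat_file`` (the name of a
        mapping file, as the reuters corpus does with ``cat_file='cats.txt'``)
        works the same way; exactly one of the three must be given.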
""" self._f2c = None #: file-to-category mapping self._c2f = None #: category-to-file mapping self._pattern = None #: regexp specifying the mapping self._map = None #: dict specifying the mapping self._file = None #: fileid of file containing the mapping self._delimiter = None #: delimiter for ``self._file`` if 'cat_pattern' in kwargs: self._pattern = kwargs['cat_pattern'] del kwargs['cat_pattern'] elif 'cat_map' in kwargs: self._map = kwargs['cat_map'] del kwargs['cat_map'] elif 'cat_file' in kwargs: self._file = kwargs['cat_file'] del kwargs['cat_file'] if 'cat_delimiter' in kwargs: self._delimiter = kwargs['cat_delimiter'] del kwargs['cat_delimiter'] else: raise ValueError('Expected keyword argument cat_pattern or ' 'cat_map or cat_file.') if ('cat_pattern' in kwargs or 'cat_map' in kwargs or 'cat_file' in kwargs): raise ValueError('Specify exactly one of: cat_pattern, ' 'cat_map, cat_file.') def _init(self): self._f2c = defaultdict(set) self._c2f = defaultdict(set) if self._pattern is not None: for file_id in self._fileids: category = re.match(self._pattern, file_id).group(1) self._add(file_id, category) elif self._map is not None: for (file_id, categories) in self._map.items(): for category in categories: self._add(file_id, category) elif self._file is not None: for line in self.open(self._file).readlines(): line = line.strip() file_id, categories = line.split(self._delimiter, 1) if file_id not in self.fileids(): raise ValueError('In category mapping file %s: %s ' 'not found' % (self._file, file_id)) for category in categories.split(self._delimiter): self._add(file_id, category) def _add(self, file_id, category): self._f2c[file_id].add(category) self._c2f[category].add(file_id) def categories(self, fileids=None): """ Return a list of the categories that are defined for this corpus, or for the file(s) if it is given. """ if self._f2c is None: self._init() if fileids is None: return sorted(self._c2f) if isinstance(fileids, compat.string_types): fileids = [fileids] return sorted(set.union(*[self._f2c[d] for d in fileids])) def fileids(self, categories=None): """ Return a list of file identifiers for the files that make up this corpus, or that make up the given category(s) if specified. """ if categories is None: return super(CategorizedCorpusReader, self).fileids() elif isinstance(categories, compat.string_types): if self._f2c is None: self._init() if categories in self._c2f: return sorted(self._c2f[categories]) else: raise ValueError('Category %s not found' % categories) else: if self._f2c is None: self._init() return sorted(set.union(*[self._c2f[c] for c in categories])) ###################################################################### #{ Treebank readers ###################################################################### #[xx] is it worth it to factor this out? class SyntaxCorpusReader(CorpusReader): """ An abstract base class for reading corpora consisting of syntactically parsed text. Subclasses should define: - ``__init__``, which specifies the location of the corpus and a method for detecting the sentence blocks in corpus files. - ``_read_block``, which reads a block from the input stream. - ``_word``, which takes a block and returns a list of list of words. - ``_tag``, which takes a block and returns a list of list of tagged words. - ``_parse``, which takes a block and returns a list of parsed sentences. 
""" def _parse(self, s): raise NotImplementedError() def _word(self, s): raise NotImplementedError() def _tag(self, s): raise NotImplementedError() def _read_block(self, stream): raise NotImplementedError() def raw(self, fileids=None): if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def parsed_sents(self, fileids=None): reader = self._read_parsed_sent_block return concat([StreamBackedCorpusView(fileid, reader, encoding=enc) for fileid, enc in self.abspaths(fileids, True)]) def tagged_sents(self, fileids=None, tagset=None): def reader(stream): return self._read_tagged_sent_block(stream, tagset) return concat([StreamBackedCorpusView(fileid, reader, encoding=enc) for fileid, enc in self.abspaths(fileids, True)]) def sents(self, fileids=None): reader = self._read_sent_block return concat([StreamBackedCorpusView(fileid, reader, encoding=enc) for fileid, enc in self.abspaths(fileids, True)]) def tagged_words(self, fileids=None, tagset=None): def reader(stream): return self._read_tagged_word_block(stream, tagset) return concat([StreamBackedCorpusView(fileid, reader, encoding=enc) for fileid, enc in self.abspaths(fileids, True)]) def words(self, fileids=None): return concat([StreamBackedCorpusView(fileid, self._read_word_block, encoding=enc) for fileid, enc in self.abspaths(fileids, True)]) #------------------------------------------------------------ #{ Block Readers def _read_word_block(self, stream): return sum(self._read_sent_block(stream), []) def _read_tagged_word_block(self, stream, tagset=None): return sum(self._read_tagged_sent_block(stream, tagset), []) def _read_sent_block(self, stream): return list(filter(None, [self._word(t) for t in self._read_block(stream)])) def _read_tagged_sent_block(self, stream, tagset=None): return list(filter(None, [self._tag(t, tagset) for t in self._read_block(stream)])) def _read_parsed_sent_block(self, stream): return list(filter(None, [self._parse(t) for t in self._read_block(stream)])) #} End of Block Readers #------------------------------------------------------------ nltk-3.1/nltk/corpus/reader/bnc.py0000644000076500000240000002217312607224144016724 0ustar sbstaff00000000000000# Natural Language Toolkit: Plaintext Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """Corpus reader for the XML version of the British National Corpus.""" from nltk.corpus.reader.util import concat from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView, ElementTree class BNCCorpusReader(XMLCorpusReader): """Corpus reader for the XML version of the British National Corpus. For access to the complete XML data structure, use the ``xml()`` method. For access to simple word lists and tagged word lists, use ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``. You can obtain the full version of the BNC corpus at http://www.ota.ox.ac.uk/desc/2554 If you extracted the archive to a directory called `BNC`, then you can instantiate the reader as:: BNCCorpusReader(root='BNC/Texts/', fileids=r'[A-K]/\w*/\w*\.xml') """ def __init__(self, root, fileids, lazy=True): XMLCorpusReader.__init__(self, root, fileids) self._lazy = lazy def words(self, fileids=None, strip_space=True, stem=False): """ :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) :param strip_space: If true, then strip trailing spaces from word tokens. 
Otherwise, leave the spaces on the tokens. :param stem: If true, then use word stems instead of word strings. """ return self._views(fileids, False, None, strip_space, stem) def tagged_words(self, fileids=None, c5=False, strip_space=True, stem=False): """ :return: the given file(s) as a list of tagged words and punctuation symbols, encoded as tuples ``(word,tag)``. :rtype: list(tuple(str,str)) :param c5: If true, then the tags used will be the more detailed c5 tags. Otherwise, the simplified tags will be used. :param strip_space: If true, then strip trailing spaces from word tokens. Otherwise, leave the spaces on the tokens. :param stem: If true, then use word stems instead of word strings. """ tag = 'c5' if c5 else 'pos' return self._views(fileids, False, tag, strip_space, stem) def sents(self, fileids=None, strip_space=True, stem=False): """ :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings. :rtype: list(list(str)) :param strip_space: If true, then strip trailing spaces from word tokens. Otherwise, leave the spaces on the tokens. :param stem: If true, then use word stems instead of word strings. """ return self._views(fileids, True, None, strip_space, stem) def tagged_sents(self, fileids=None, c5=False, strip_space=True, stem=False): """ :return: the given file(s) as a list of sentences, each encoded as a list of ``(word,tag)`` tuples. :rtype: list(list(tuple(str,str))) :param c5: If true, then the tags used will be the more detailed c5 tags. Otherwise, the simplified tags will be used. :param strip_space: If true, then strip trailing spaces from word tokens. Otherwise, leave the spaces on the tokens. :param stem: If true, then use word stems instead of word strings. """ tag = 'c5' if c5 else 'pos' return self._views(fileids, sent=True, tag=tag, strip_space=strip_space, stem=stem) def _views(self, fileids=None, sent=False, tag=False, strip_space=True, stem=False): """A helper function that instantiates BNCWordViews or the list of words/sentences.""" f = BNCWordView if self._lazy else self._words return concat([f(fileid, sent, tag, strip_space, stem) for fileid in self.abspaths(fileids)]) def _words(self, fileid, bracket_sent, tag, strip_space, stem): """ Helper used to implement the view methods -- returns a list of words or a list of sentences, optionally tagged. :param fileid: The name of the underlying file. :param bracket_sent: If true, include sentence bracketing. :param tag: The name of the tagset to use, or None for no tags. :param strip_space: If true, strip spaces from word tokens. :param stem: If true, then substitute stems for words. """ result = [] xmldoc = ElementTree.parse(fileid).getroot() for xmlsent in xmldoc.findall('.//s'): sent = [] for xmlword in _all_xmlwords_in(xmlsent): word = xmlword.text if not word: word = "" # fixes issue 337? 
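                # Normalize the token next: strip surrounding whitespace,
                # substitute the head-word stem from the 'hw' attribute when
                # stem=True, and attach the C5 or simplified POS tag (falling
                # back to C5) before appending the word to the current sentence.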
if strip_space or stem: word = word.strip() if stem: word = xmlword.get('hw', word) if tag == 'c5': word = (word, xmlword.get('c5')) elif tag == 'pos': word = (word, xmlword.get('pos', xmlword.get('c5'))) sent.append(word) if bracket_sent: result.append(BNCSentence(xmlsent.attrib['n'], sent)) else: result.extend(sent) assert None not in result return result def _all_xmlwords_in(elt, result=None): if result is None: result = [] for child in elt: if child.tag in ('c', 'w'): result.append(child) else: _all_xmlwords_in(child, result) return result class BNCSentence(list): """ A list of words, augmented by an attribute ``num`` used to record the sentence identifier (the ``n`` attribute from the XML). """ def __init__(self, num, items): self.num = num list.__init__(self, items) class BNCWordView(XMLCorpusView): """ A stream backed corpus view specialized for use with the BNC corpus. """ tags_to_ignore = set( ['pb', 'gap', 'vocal', 'event', 'unclear', 'shift', 'pause', 'align'] ) """These tags are ignored. For their description refer to the technical documentation, for example, http://www.natcorp.ox.ac.uk/docs/URG/ref-vocal.html """ def __init__(self, fileid, sent, tag, strip_space, stem): """ :param fileid: The name of the underlying file. :param sent: If true, include sentence bracketing. :param tag: The name of the tagset to use, or None for no tags. :param strip_space: If true, strip spaces from word tokens. :param stem: If true, then substitute stems for words. """ if sent: tagspec = '.*/s' else: tagspec = '.*/s/(.*/)?(c|w)' self._sent = sent self._tag = tag self._strip_space = strip_space self._stem = stem self.title = None #: Title of the document. self.author = None #: Author of the document. self.editor = None #: Editor self.resps = None #: Statement of responsibility XMLCorpusView.__init__(self, fileid, tagspec) # Read in a tasty header. self._open() self.read_block(self._stream, '.*/teiHeader$', self.handle_header) self.close() # Reset tag context. self._tag_context = {0: ()} def handle_header(self, elt, context): # Set up some metadata! titles = elt.findall('titleStmt/title') if titles: self.title = '\n'.join(title.text.strip() for title in titles) authors = elt.findall('titleStmt/author') if authors: self.author = '\n'.join(author.text.strip() for author in authors) editors = elt.findall('titleStmt/editor') if editors: self.editor = '\n'.join(editor.text.strip() for editor in editors) resps = elt.findall('titleStmt/respStmt') if resps: self.resps = '\n\n'.join( '\n'.join( resp_elt.text.strip() for resp_elt in resp ) for resp in resps ) def handle_elt(self, elt, context): if self._sent: return self.handle_sent(elt) else: return self.handle_word(elt) def handle_word(self, elt): word = elt.text if not word: word = "" # fixes issue 337? 
if self._strip_space or self._stem: word = word.strip() if self._stem: word = elt.get('hw', word) if self._tag == 'c5': word = (word, elt.get('c5')) elif self._tag == 'pos': word = (word, elt.get('pos', elt.get('c5'))) return word def handle_sent(self, elt): sent = [] for child in elt: if child.tag in ('mw', 'hi', 'corr', 'trunc'): sent += [self.handle_word(w) for w in child] elif child.tag in ('w', 'c'): sent.append(self.handle_word(child)) elif child.tag not in self.tags_to_ignore: raise ValueError('Unexpected element %s' % child.tag) return BNCSentence(elt.attrib['n'], sent) nltk-3.1/nltk/corpus/reader/bracket_parse.py0000644000076500000240000002403512607224144020766 0ustar sbstaff00000000000000# Natural Language Toolkit: Penn Treebank Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ Corpus reader for corpora that consist of parenthesis-delineated parse trees. """ import sys from nltk.tree import Tree from nltk.tag import map_tag from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * # we use [^\s()]+ instead of \S+? to avoid matching () SORTTAGWRD = re.compile(r'\((\d+) ([^\s()]+) ([^\s()]+)\)') TAGWORD = re.compile(r'\(([^\s()]+) ([^\s()]+)\)') WORD = re.compile(r'\([^\s()]+ ([^\s()]+)\)') EMPTY_BRACKETS = re.compile(r'\s*\(\s*\(') class BracketParseCorpusReader(SyntaxCorpusReader): """ Reader for corpora that consist of parenthesis-delineated parse trees, like those found in the "combined" section of the Penn Treebank, e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))". """ def __init__(self, root, fileids, comment_char=None, detect_blocks='unindented_paren', encoding='utf8', tagset=None): """ :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. :param comment_char: The character which can appear at the start of a line to indicate that the rest of the line is a comment. :param detect_blocks: The method that is used to find blocks in the corpus; can be 'unindented_paren' (every unindented parenthesis starts a new parse) or 'sexpr' (brackets are matched). :param tagset: The name of the tagset used by this corpus, to be used for normalizing or converting the POS tags returned by the tagged_...() methods. """ CorpusReader.__init__(self, root, fileids, encoding) self._comment_char = comment_char self._detect_blocks = detect_blocks self._tagset = tagset def _read_block(self, stream): if self._detect_blocks == 'sexpr': return read_sexpr_block(stream, comment_char=self._comment_char) elif self._detect_blocks == 'blankline': return read_blankline_block(stream) elif self._detect_blocks == 'unindented_paren': # Tokens start with unindented left parens. toks = read_regexp_block(stream, start_re=r'^\(') # Strip any comments out of the tokens. if self._comment_char: toks = [re.sub('(?m)^%s.*'%re.escape(self._comment_char), '', tok) for tok in toks] return toks else: assert 0, 'bad block type' def _normalize(self, t): # If there's an empty set of brackets surrounding the actual # parse, then strip them off. if EMPTY_BRACKETS.match(t): t = t.strip()[1:-1] # Replace leaves of the form (!), (,), with (! 
!), (, ,) t = re.sub(r"\((.)\)", r"(\1 \1)", t) # Replace leaves of the form (tag word root) with (tag word) t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t) return t def _parse(self, t): try: return Tree.fromstring(self._normalize(t)) except ValueError as e: sys.stderr.write("Bad tree detected; trying to recover...\n") # Try to recover, if we can: if e.args == ('mismatched parens',): for n in range(1, 5): try: v = Tree(self._normalize(t+')'*n)) sys.stderr.write(" Recovered by adding %d close " "paren(s)\n" % n) return v except ValueError: pass # Try something else: sys.stderr.write(" Recovered by returning a flat parse.\n") #sys.stderr.write(' '.join(t.split())+'\n') return Tree('S', self._tag(t)) def _tag(self, t, tagset=None): tagged_sent = [(w,p) for (p,w) in TAGWORD.findall(self._normalize(t))] if tagset and tagset != self._tagset: tagged_sent = [(w, map_tag(self._tagset, tagset, p)) for (w,p) in tagged_sent] return tagged_sent def _word(self, t): return WORD.findall(self._normalize(t)) class CategorizedBracketParseCorpusReader(CategorizedCorpusReader, BracketParseCorpusReader): """ A reader for parsed corpora whose documents are divided into categories based on their file identifiers. @author: Nathan Schneider """ def __init__(self, *args, **kwargs): """ Initialize the corpus reader. Categorization arguments (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to the L{CategorizedCorpusReader constructor }. The remaining arguments are passed to the L{BracketParseCorpusReader constructor }. """ CategorizedCorpusReader.__init__(self, kwargs) BracketParseCorpusReader.__init__(self, *args, **kwargs) def _resolve(self, fileids, categories): if fileids is not None and categories is not None: raise ValueError('Specify fileids or categories, not both') if categories is not None: return self.fileids(categories) else: return fileids def raw(self, fileids=None, categories=None): return BracketParseCorpusReader.raw( self, self._resolve(fileids, categories)) def words(self, fileids=None, categories=None): return BracketParseCorpusReader.words( self, self._resolve(fileids, categories)) def sents(self, fileids=None, categories=None): return BracketParseCorpusReader.sents( self, self._resolve(fileids, categories)) def paras(self, fileids=None, categories=None): return BracketParseCorpusReader.paras( self, self._resolve(fileids, categories)) def tagged_words(self, fileids=None, categories=None, tagset=None): return BracketParseCorpusReader.tagged_words( self, self._resolve(fileids, categories), tagset) def tagged_sents(self, fileids=None, categories=None, tagset=None): return BracketParseCorpusReader.tagged_sents( self, self._resolve(fileids, categories), tagset) def tagged_paras(self, fileids=None, categories=None, tagset=None): return BracketParseCorpusReader.tagged_paras( self, self._resolve(fileids, categories), tagset) def parsed_words(self, fileids=None, categories=None): return BracketParseCorpusReader.parsed_words( self, self._resolve(fileids, categories)) def parsed_sents(self, fileids=None, categories=None): return BracketParseCorpusReader.parsed_sents( self, self._resolve(fileids, categories)) def parsed_paras(self, fileids=None, categories=None): return BracketParseCorpusReader.parsed_paras( self, self._resolve(fileids, categories)) class AlpinoCorpusReader(BracketParseCorpusReader): """ Reader for the Alpino Dutch Treebank. 
This corpus has a lexical breakdown structure embedded, as read by _parse Unfortunately this puts punctuation and some other words out of the sentence order in the xml element tree. This is no good for tag_ and word_ _tag and _word will be overridden to use a non-default new parameter 'ordered' to the overridden _normalize function. The _parse function can then remain untouched. """ def __init__(self, root, encoding='ISO-8859-1', tagset=None): BracketParseCorpusReader.__init__(self, root, 'alpino\.xml', detect_blocks='blankline', encoding=encoding, tagset=tagset) def _normalize(self, t, ordered = False): """Normalize the xml sentence element in t. The sentence elements , although embedded in a few overall xml elements, are seperated by blank lines. That's how the reader can deliver them one at a time. Each sentence has a few category subnodes that are of no use to us. The remaining word nodes may or may not appear in the proper order. Each word node has attributes, among which: - begin : the position of the word in the sentence - pos : Part of Speech: the Tag - word : the actual word The return value is a string with all xml elementes replaced by clauses: either a cat clause with nested clauses, or a word clause. The order of the bracket clauses closely follows the xml. If ordered == True, the word clauses include an order sequence number. If ordered == False, the word clauses only have pos and word parts. """ if t[:10] != "', r"(\1", t) if ordered: t = re.sub(r' ', r"(\1 \2 \3)", t) else: t = re.sub(r' ', r"(\1 \2)", t) t = re.sub(r" ", r")", t) t = re.sub(r".*", r"", t) t = re.sub(r"", r"", t) return t def _tag(self, t, tagset=None): tagged_sent = [(int(o), w, p) for (o,p,w) in SORTTAGWRD.findall(self._normalize(t, ordered = True))] tagged_sent.sort() if tagset and tagset != self._tagset: tagged_sent = [(w, map_tag(self._tagset, tagset, p)) for (o,w,p) in tagged_sent] else: tagged_sent = [(w,p) for (o,w,p) in tagged_sent] return tagged_sent def _word(self, t): """Return a correctly ordered list if words""" tagged_sent = self._tag(t) return [w for (w,p) in tagged_sent] nltk-3.1/nltk/corpus/reader/categorized_sents.py0000644000076500000240000001532412607224144021676 0ustar sbstaff00000000000000# Natural Language Toolkit: Categorized Sentences Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Pierpaolo Pantone <24alsecondo@gmail.com> # URL: # For license information, see LICENSE.TXT """ CorpusReader structured for corpora that contain one instance on each row. This CorpusReader is specifically used for the Subjectivity Dataset and the Sentence Polarity Dataset. - Subjectivity Dataset information - Authors: Bo Pang and Lillian Lee. Url: http://www.cs.cornell.edu/people/pabo/movie-review-data Distributed with permission. Related papers: - Bo Pang and Lillian Lee. "A Sentimental Education: Sentiment Analysis Using Subjectivity Summarization Based on Minimum Cuts". Proceedings of the ACL, 2004. - Sentence Polarity Dataset information - Authors: Bo Pang and Lillian Lee. Url: http://www.cs.cornell.edu/people/pabo/movie-review-data Related papers: - Bo Pang and Lillian Lee. "Seeing stars: Exploiting class relationships for sentiment categorization with respect to rating scales". Proceedings of the ACL, 2005. """ from nltk.corpus.reader.api import * from nltk.tokenize import * class CategorizedSentencesCorpusReader(CategorizedCorpusReader, CorpusReader): """ A reader for corpora in which each row represents a single instance, mainly a sentence. 
Istances are divided into categories based on their file identifiers (see CategorizedCorpusReader). Since many corpora allow rows that contain more than one sentence, it is possible to specify a sentence tokenizer to retrieve all sentences instead than all rows. Examples using the Subjectivity Dataset: >>> from nltk.corpus import subjectivity >>> subjectivity.sents()[23] ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits', 'happened', 'off', 'screen', '.'] >>> subjectivity.categories() ['obj', 'subj'] >>> subjectivity.words(categories='subj') ['smart', 'and', 'alert', ',', 'thirteen', ...] Examples using the Sentence Polarity Dataset: >>> from nltk.corpus import sentence_polarity >>> sentence_polarity.sents() [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', 'it', 'funny', '.'], ...] >>> sentence_polarity.categories() ['neg', 'pos'] """ CorpusView = StreamBackedCorpusView def __init__(self, root, fileids, word_tokenizer=WhitespaceTokenizer(), sent_tokenizer=None, encoding='utf8', **kwargs): """ :param root: The root directory for the corpus. :param fileids: a list or regexp specifying the fileids in the corpus. :param word_tokenizer: a tokenizer for breaking sentences or paragraphs into words. Default: `WhitespaceTokenizer` :param sent_tokenizer: a tokenizer for breaking paragraphs into sentences. :param encoding: the encoding that should be used to read the corpus. :param kwargs: additional parameters passed to CategorizedCorpusReader. """ CorpusReader.__init__(self, root, fileids, encoding) CategorizedCorpusReader.__init__(self, kwargs) self._word_tokenizer = word_tokenizer self._sent_tokenizer = sent_tokenizer def _resolve(self, fileids, categories): if fileids is not None and categories is not None: raise ValueError('Specify fileids or categories, not both') if categories is not None: return self.fileids(categories) else: return fileids def raw(self, fileids=None, categories=None): """ :param fileids: a list or regexp specifying the fileids that have to be returned as a raw string. :param categories: a list specifying the categories whose files have to be returned as a raw string. :return: the given file(s) as a single string. :rtype: str """ fileids = self._resolve(fileids, categories) if fileids is None: fileids = self._fileids elif isinstance(fileids, string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def readme(self): """ Return the contents of the corpus Readme.txt file. """ return self.open("README").read() def sents(self, fileids=None, categories=None): """ Return all sentences in the corpus or in the specified file(s). :param fileids: a list or regexp specifying the ids of the files whose sentences have to be returned. :param categories: a list specifying the categories whose sentences have to be returned. :return: the given file(s) as a list of sentences. Each sentence is tokenized using the specified word_tokenizer. :rtype: list(list(str)) """ fileids = self._resolve(fileids, categories) if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.CorpusView(path, self._read_sent_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True)]) def words(self, fileids=None, categories=None): """ Return all words and punctuation symbols in the corpus or in the specified file(s). 
:param fileids: a list or regexp specifying the ids of the files whose words have to be returned. :param categories: a list specifying the categories whose words have to be returned. :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ fileids = self._resolve(fileids, categories) if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.CorpusView(path, self._read_word_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True)]) def _read_sent_block(self, stream): sents = [] for i in range(20): # Read 20 lines at a time. line = stream.readline() if not line: continue if self._sent_tokenizer: sents.extend([self._word_tokenizer.tokenize(sent) for sent in self._sent_tokenizer.tokenize(line)]) else: sents.append(self._word_tokenizer.tokenize(line)) return sents def _read_word_block(self, stream): words = [] for sent in self._read_sent_block(stream): words.extend(sent) return words nltk-3.1/nltk/corpus/reader/chasen.py0000644000076500000240000001120412607224144017414 0ustar sbstaff00000000000000# # Copyright (C) 2001-2015 NLTK Project # Author: Masato Hagiwara # URL: # For license information, see LICENSE.TXT # For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html from __future__ import print_function import sys from nltk.corpus.reader import util from nltk import compat from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class ChasenCorpusReader(CorpusReader): def __init__(self, root, fileids, encoding='utf8', sent_splitter=None): self._sent_splitter = sent_splitter CorpusReader.__init__(self, root, fileids, encoding) def raw(self, fileids=None): if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def words(self, fileids=None): return concat([ChasenCorpusView(fileid, enc, False, False, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)]) def tagged_words(self, fileids=None): return concat([ChasenCorpusView(fileid, enc, True, False, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)]) def sents(self, fileids=None): return concat([ChasenCorpusView(fileid, enc, False, True, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)]) def tagged_sents(self, fileids=None): return concat([ChasenCorpusView(fileid, enc, True, True, False, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)]) def paras(self, fileids=None): return concat([ChasenCorpusView(fileid, enc, False, True, True, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)]) def tagged_paras(self, fileids=None): return concat([ChasenCorpusView(fileid, enc, True, True, True, self._sent_splitter) for (fileid, enc) in self.abspaths(fileids, True)]) class ChasenCorpusView(StreamBackedCorpusView): """ A specialized corpus view for ChasenReader. Similar to ``TaggedCorpusView``, but this'll use fixed sets of word and sentence tokenizer. 
""" def __init__(self, corpus_file, encoding, tagged, group_by_sent, group_by_para, sent_splitter=None): self._tagged = tagged self._group_by_sent = group_by_sent self._group_by_para = group_by_para self._sent_splitter = sent_splitter StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) def read_block(self, stream): """Reads one paragraph at a time.""" block = [] for para_str in read_regexp_block(stream, r".", r"^EOS\n"): para = [] sent = [] for line in para_str.splitlines(): _eos = line.strip() == 'EOS' _cells = line.split('\t') w = (_cells[0], '\t'.join(_cells[1:])) if not _eos: sent.append(w) if _eos or (self._sent_splitter and self._sent_splitter(w)): if not self._tagged: sent = [w for (w,t) in sent] if self._group_by_sent: para.append(sent) else: para.extend(sent) sent = [] if len(sent)>0: if not self._tagged: sent = [w for (w,t) in sent] if self._group_by_sent: para.append(sent) else: para.extend(sent) if self._group_by_para: block.append(para) else: block.extend(para) return block def demo(): import nltk from nltk.corpus.util import LazyCorpusLoader jeita = LazyCorpusLoader( 'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8') print('/'.join( jeita.words()[22100:22140] )) print('\nEOS\n'.join('\n'.join("%s/%s" % (w[0],w[1].split('\t')[2]) for w in sent) for sent in jeita.tagged_sents()[2170:2173])) def test(): from nltk.corpus.util import LazyCorpusLoader jeita = LazyCorpusLoader( 'jeita', ChasenCorpusReader, r'.*chasen', encoding='utf-8') assert isinstance(jeita.tagged_words()[0][1], compat.string_types) if __name__ == '__main__': demo() test() nltk-3.1/nltk/corpus/reader/childes.py0000644000076500000240000005345212607224144017601 0ustar sbstaff00000000000000# CHILDES XML Corpus Reader # Copyright (C) 2001-2015 NLTK Project # Author: Tomonori Nagano # Alexis Dimitriadis # URL: # For license information, see LICENSE.TXT """ Corpus reader for the XML version of the CHILDES corpus. """ from __future__ import print_function __docformat__ = 'epytext en' import re from collections import defaultdict from nltk.util import flatten from nltk.compat import string_types from nltk.corpus.reader.util import concat from nltk.corpus.reader.xmldocs import XMLCorpusReader, ElementTree # to resolve the namespace issue NS = 'http://www.talkbank.org/ns/talkbank' class CHILDESCorpusReader(XMLCorpusReader): """ Corpus reader for the XML version of the CHILDES corpus. The CHILDES corpus is available at ``http://childes.psy.cmu.edu/``. The XML version of CHILDES is located at ``http://childes.psy.cmu.edu/data-xml/``. Copy the needed parts of the CHILDES XML corpus into the NLTK data directory (``nltk_data/corpora/CHILDES/``). For access to the file text use the usual nltk functions, ``words()``, ``sents()``, ``tagged_words()`` and ``tagged_sents()``. """ def __init__(self, root, fileids, lazy=True): XMLCorpusReader.__init__(self, root, fileids) self._lazy = lazy def words(self, fileids=None, speaker='ALL', stem=False, relation=False, strip_space=True, replace=False): """ :return: the given file(s) as a list of words :rtype: list(str) :param speaker: If specified, select specific speaker(s) defined in the corpus. Default is 'ALL' (all participants). Common choices are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude researchers) :param stem: If true, then use word stems instead of word strings. :param relation: If true, then return tuples of (stem, index, dependent_index) :param strip_space: If true, then strip trailing spaces from word tokens. 
Otherwise, leave the spaces on the tokens. :param replace: If true, then use the replaced (intended) word instead of the original word (e.g., 'wat' will be replaced with 'watch') """ sent=None pos=False return concat([self._get_words(fileid, speaker, sent, stem, relation, pos, strip_space, replace) for fileid in self.abspaths(fileids)]) def tagged_words(self, fileids=None, speaker='ALL', stem=False, relation=False, strip_space=True, replace=False): """ :return: the given file(s) as a list of tagged words and punctuation symbols, encoded as tuples ``(word,tag)``. :rtype: list(tuple(str,str)) :param speaker: If specified, select specific speaker(s) defined in the corpus. Default is 'ALL' (all participants). Common choices are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude researchers) :param stem: If true, then use word stems instead of word strings. :param relation: If true, then return tuples of (stem, index, dependent_index) :param strip_space: If true, then strip trailing spaces from word tokens. Otherwise, leave the spaces on the tokens. :param replace: If true, then use the replaced (intended) word instead of the original word (e.g., 'wat' will be replaced with 'watch') """ sent=None pos=True return concat([self._get_words(fileid, speaker, sent, stem, relation, pos, strip_space, replace) for fileid in self.abspaths(fileids)]) def sents(self, fileids=None, speaker='ALL', stem=False, relation=None, strip_space=True, replace=False): """ :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings. :rtype: list(list(str)) :param speaker: If specified, select specific speaker(s) defined in the corpus. Default is 'ALL' (all participants). Common choices are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude researchers) :param stem: If true, then use word stems instead of word strings. :param relation: If true, then return tuples of ``(str,pos,relation_list)``. If there is manually-annotated relation info, it will return tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)`` :param strip_space: If true, then strip trailing spaces from word tokens. Otherwise, leave the spaces on the tokens. :param replace: If true, then use the replaced (intended) word instead of the original word (e.g., 'wat' will be replaced with 'watch') """ sent=True pos=False return concat([self._get_words(fileid, speaker, sent, stem, relation, pos, strip_space, replace) for fileid in self.abspaths(fileids)]) def tagged_sents(self, fileids=None, speaker='ALL', stem=False, relation=None, strip_space=True, replace=False): """ :return: the given file(s) as a list of sentences, each encoded as a list of ``(word,tag)`` tuples. :rtype: list(list(tuple(str,str))) :param speaker: If specified, select specific speaker(s) defined in the corpus. Default is 'ALL' (all participants). Common choices are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude researchers) :param stem: If true, then use word stems instead of word strings. :param relation: If true, then return tuples of ``(str,pos,relation_list)``. If there is manually-annotated relation info, it will return tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)`` :param strip_space: If true, then strip trailing spaces from word tokens. Otherwise, leave the spaces on the tokens. 
:param replace: If true, then use the replaced (intended) word instead of the original word (e.g., 'wat' will be replaced with 'watch') """ sent=True pos=True return concat([self._get_words(fileid, speaker, sent, stem, relation, pos, strip_space, replace) for fileid in self.abspaths(fileids)]) def corpus(self, fileids=None): """ :return: the given file(s) as a dict of ``(corpus_property_key, value)`` :rtype: list(dict) """ return [self._get_corpus(fileid) for fileid in self.abspaths(fileids)] def _get_corpus(self, fileid): results = dict() xmldoc = ElementTree.parse(fileid).getroot() for key, value in xmldoc.items(): results[key] = value return results def participants(self, fileids=None): """ :return: the given file(s) as a dict of ``(participant_property_key, value)`` :rtype: list(dict) """ return [self._get_participants(fileid) for fileid in self.abspaths(fileids)] def _get_participants(self, fileid): # multidimensional dicts def dictOfDicts(): return defaultdict(dictOfDicts) xmldoc = ElementTree.parse(fileid).getroot() # getting participants' data pat = dictOfDicts() for participant in xmldoc.findall('.//{%s}Participants/{%s}participant' % (NS,NS)): for (key,value) in participant.items(): pat[participant.get('id')][key] = value return pat def age(self, fileids=None, speaker='CHI', month=False): """ :return: the given file(s) as string or int :rtype: list or int :param month: If true, return months instead of year-month-date """ return [self._get_age(fileid, speaker, month) for fileid in self.abspaths(fileids)] def _get_age(self, fileid, speaker, month): xmldoc = ElementTree.parse(fileid).getroot() for pat in xmldoc.findall('.//{%s}Participants/{%s}participant' % (NS,NS)): try: if pat.get('id') == speaker: age = pat.get('age') if month: age = self.convert_age(age) return age # some files don't have age data except (TypeError, AttributeError) as e: return None def convert_age(self, age_year): "Caclculate age in months from a string in CHILDES format" m = re.match("P(\d+)Y(\d+)M?(\d?\d?)D?",age_year) age_month = int(m.group(1))*12 + int(m.group(2)) try: if int(m.group(3)) > 15: age_month += 1 # some corpora don't have age information? 
except ValueError as e: pass return age_month def MLU(self, fileids=None, speaker='CHI'): """ :return: the given file(s) as a floating number :rtype: list(float) """ return [self._getMLU(fileid, speaker=speaker) for fileid in self.abspaths(fileids)] def _getMLU(self, fileid, speaker): sents = self._get_words(fileid, speaker=speaker, sent=True, stem=True, relation=False, pos=True, strip_space=True, replace=True) results = [] lastSent = [] numFillers = 0 sentDiscount = 0 for sent in sents: posList = [pos for (word,pos) in sent] # if any part of the sentence is intelligible if any(pos == 'unk' for pos in posList): next # if the sentence is null elif sent == []: next # if the sentence is the same as the last sent elif sent == lastSent: next else: results.append([word for (word,pos) in sent]) # count number of fillers if len(set(['co',None]).intersection(posList)) > 0: numFillers += posList.count('co') numFillers += posList.count(None) sentDiscount += 1 lastSent = sent try: thisWordList = flatten(results) # count number of morphemes # (e.g., 'read' = 1 morpheme but 'read-PAST' is 2 morphemes) numWords = float(len(flatten([word.split('-') for word in thisWordList]))) - numFillers numSents = float(len(results)) - sentDiscount mlu = numWords/numSents except ZeroDivisionError: mlu = 0 # return {'mlu':mlu,'wordNum':numWords,'sentNum':numSents} return mlu def _get_words(self, fileid, speaker, sent, stem, relation, pos, strip_space, replace): if isinstance(speaker, string_types) and speaker != 'ALL': # ensure we have a list of speakers speaker = [ speaker ] xmldoc = ElementTree.parse(fileid).getroot() # processing each xml doc results = [] for xmlsent in xmldoc.findall('.//{%s}u' % NS): sents = [] # select speakers if speaker == 'ALL' or xmlsent.get('who') in speaker: for xmlword in xmlsent.findall('.//{%s}w' % NS): infl = None ; suffixStem = None; suffixTag = None # getting replaced words if replace and xmlsent.find('.//{%s}w/{%s}replacement' % (NS,NS)): xmlword = xmlsent.find('.//{%s}w/{%s}replacement/{%s}w' % (NS,NS,NS)) elif replace and xmlsent.find('.//{%s}w/{%s}wk' % (NS,NS)): xmlword = xmlsent.find('.//{%s}w/{%s}wk' % (NS,NS)) # get text if xmlword.text: word = xmlword.text else: word = '' # strip tailing space if strip_space: word = word.strip() # stem if relation or stem: try: xmlstem = xmlword.find('.//{%s}stem' % NS) word = xmlstem.text except AttributeError as e: pass # if there is an inflection try: xmlinfl = xmlword.find('.//{%s}mor/{%s}mw/{%s}mk' % (NS,NS,NS)) word += '-' + xmlinfl.text except: pass # if there is a suffix try: xmlsuffix = xmlword.find('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}stem' % (NS,NS,NS,NS)) suffixStem = xmlsuffix.text except AttributeError: suffixStem = "" if suffixStem: word += "~"+suffixStem # pos if relation or pos: try: xmlpos = xmlword.findall(".//{%s}c" % NS) xmlpos2 = xmlword.findall(".//{%s}s" % NS) if xmlpos2 != []: tag = xmlpos[0].text+":"+xmlpos2[0].text else: tag = xmlpos[0].text except (AttributeError,IndexError) as e: tag = "" try: xmlsuffixpos = xmlword.findall('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}c' % (NS,NS,NS,NS,NS)) xmlsuffixpos2 = xmlword.findall('.//{%s}mor/{%s}mor-post/{%s}mw/{%s}pos/{%s}s' % (NS,NS,NS,NS,NS)) if xmlsuffixpos2: suffixTag = xmlsuffixpos[0].text+":"+xmlsuffixpos2[0].text else: suffixTag = xmlsuffixpos[0].text except: pass if suffixTag: tag += "~"+suffixTag word = (word, tag) # relational # the gold standard is stored in # if relation == True: for xmlstem_rel in xmlword.findall('.//{%s}mor/{%s}gra' % (NS,NS)): if not 
xmlstem_rel.get('type') == 'grt': word = (word[0], word[1], xmlstem_rel.get('index') + "|" + xmlstem_rel.get('head') + "|" + xmlstem_rel.get('relation')) else: word = (word[0], word[1], word[2], word[0], word[1], xmlstem_rel.get('index') + "|" + xmlstem_rel.get('head') + "|" + xmlstem_rel.get('relation')) try: for xmlpost_rel in xmlword.findall('.//{%s}mor/{%s}mor-post/{%s}gra' % (NS,NS,NS)): if not xmlpost_rel.get('type') == 'grt': suffixStem = (suffixStem[0], suffixStem[1], xmlpost_rel.get('index') + "|" + xmlpost_rel.get('head') + "|" + xmlpost_rel.get('relation')) else: suffixStem = (suffixStem[0], suffixStem[1], suffixStem[2], suffixStem[0], suffixStem[1], xmlpost_rel.get('index') + "|" + xmlpost_rel.get('head') + "|" + xmlpost_rel.get('relation')) except: pass sents.append(word) if sent or relation: results.append(sents) else: results.extend(sents) return results # Ready-to-use browser opener """ The base URL for viewing files on the childes website. This shouldn't need to be changed, unless CHILDES changes the configuration of their server or unless the user sets up their own corpus webserver. """ childes_url_base = r'http://childes.psy.cmu.edu/browser/index.php?url=' def webview_file(self, fileid, urlbase=None): """Map a corpus file to its web version on the CHILDES website, and open it in a web browser. The complete URL to be used is: childes.childes_url_base + urlbase + fileid.replace('.xml', '.cha') If no urlbase is passed, we try to calculate it. This requires that the childes corpus was set up to mirror the folder hierarchy under childes.psy.cmu.edu/data-xml/, e.g.: nltk_data/corpora/childes/Eng-USA/Cornell/??? or nltk_data/corpora/childes/Romance/Spanish/Aguirre/??? The function first looks (as a special case) if "Eng-USA" is on the path consisting of +fileid; then if "childes", possibly followed by "data-xml", appears. If neither one is found, we use the unmodified fileid and hope for the best. If this is not right, specify urlbase explicitly, e.g., if the corpus root points to the Cornell folder, urlbase='Eng-USA/Cornell'. """ import webbrowser, re if urlbase: path = urlbase+"/"+fileid else: full = self.root + "/" + fileid full = re.sub(r'\\', '/', full) if '/childes/' in full.lower(): # Discard /data-xml/ if present path = re.findall(r'(?i)/childes(?:/data-xml)?/(.*)\.xml', full)[0] elif 'eng-usa' in full.lower(): path = 'Eng-USA/' + re.findall(r'/(?i)Eng-USA/(.*)\.xml', full)[0] else: path = fileid # Strip ".xml" and add ".cha", as necessary: if path.endswith('.xml'): path = path[:-4] if not path.endswith('.cha'): path = path+'.cha' url = self.childes_url_base + path webbrowser.open_new_tab(url) print("Opening in browser:", url) # Pausing is a good idea, but it's up to the user... 
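    # Usage note (added illustrative commentary, not executed by the reader):
    # age() returns CHILDES period strings such as "P2Y6M14D" unless month=True,
    # in which case convert_age() maps them to months:
    #     2 years * 12 + 6 months = 30   (one extra month is added only when the
    #     trailing day count exceeds 15, so "P2Y6M20D" would give 31).
    # The sample age strings here are invented for illustration.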
# raw_input("Hit Return to continue") def demo(corpus_root=None): """ The CHILDES corpus should be manually downloaded and saved to ``[NLTK_Data_Dir]/corpora/childes/`` """ if not corpus_root: from nltk.data import find corpus_root = find('corpora/childes/data-xml/Eng-USA/') try: childes = CHILDESCorpusReader(corpus_root, '.*.xml') # describe all corpus for file in childes.fileids()[:5]: corpus = '' corpus_id = '' for (key,value) in childes.corpus(file)[0].items(): if key == "Corpus": corpus = value if key == "Id": corpus_id = value print('Reading', corpus,corpus_id,' .....') print("words:", childes.words(file)[:7],"...") print("words with replaced words:", childes.words(file, replace=True)[:7]," ...") print("words with pos tags:", childes.tagged_words(file)[:7]," ...") print("words (only MOT):", childes.words(file, speaker='MOT')[:7], "...") print("words (only CHI):", childes.words(file, speaker='CHI')[:7], "...") print("stemmed words:", childes.words(file, stem=True)[:7]," ...") print("words with relations and pos-tag:", childes.words(file, relation=True)[:5]," ...") print("sentence:", childes.sents(file)[:2]," ...") for (participant, values) in childes.participants(file)[0].items(): for (key, value) in values.items(): print("\tparticipant", participant, key, ":", value) print("num of sent:", len(childes.sents(file))) print("num of morphemes:", len(childes.words(file, stem=True))) print("age:", childes.age(file)) print("age in month:", childes.age(file, month=True)) print("MLU:", childes.MLU(file)) print() except LookupError as e: print("""The CHILDES corpus, or the parts you need, should be manually downloaded from http://childes.psy.cmu.edu/data-xml/ and saved at [NLTK_Data_Dir]/corpora/childes/ Alternately, you can call the demo with the path to a portion of the CHILDES corpus, e.g.: demo('/path/to/childes/data-xml/Eng-USA/") """) #corpus_root_http = urllib2.urlopen('http://childes.psy.cmu.edu/data-xml/Eng-USA/Bates.zip') #corpus_root_http_bates = zipfile.ZipFile(cStringIO.StringIO(corpus_root_http.read())) ##this fails #childes = CHILDESCorpusReader(corpus_root_http_bates,corpus_root_http_bates.namelist()) if __name__ == "__main__": demo() nltk-3.1/nltk/corpus/reader/chunked.py0000644000076500000240000002074512607224144017606 0ustar sbstaff00000000000000# Natural Language Toolkit: Chunked Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ A reader for corpora that contain chunked (and optionally tagged) documents. """ import os.path, codecs import nltk from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader from nltk import compat from nltk.tree import Tree from nltk.tokenize import * from nltk.chunk import tagstr2tree from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class ChunkedCorpusReader(CorpusReader): """ Reader for chunked (and optionally tagged) corpora. Paragraphs are split using a block reader. They are then tokenized into sentences using a sentence tokenizer. Finally, these sentences are parsed into chunk trees using a string-to-chunktree conversion function. Each of these steps can be performed using a default function or a custom function. By default, paragraphs are split on blank lines; sentences are listed one per line; and sentences are parsed into chunk trees using ``nltk.chunk.tagstr2tree``. 
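    For example, a sentence in the default format marks chunks with square
    brackets around ``word/tag`` tokens, and ``tagstr2tree`` turns it into a
    shallow tree (an illustrative sketch; the bracketed sentence below is
    invented, not taken from any particular corpus):

        from nltk.chunk import tagstr2tree
        tree = tagstr2tree("[ the/DT little/JJ cat/NN ] sat/VBD on/IN [ the/DT mat/NN ]")
        # roughly Tree('S', [Tree('NP', [('the', 'DT'), ...]), ('sat', 'VBD'), ...])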
""" def __init__(self, root, fileids, extension='', str2chunktree=tagstr2tree, sent_tokenizer=RegexpTokenizer('\n', gaps=True), para_block_reader=read_blankline_block, encoding='utf8', tagset=None): """ :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. """ CorpusReader.__init__(self, root, fileids, encoding) self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset) """Arguments for corpus views generated by this corpus: a tuple (str2chunktree, sent_tokenizer, para_block_tokenizer)""" def raw(self, fileids=None): """ :return: the given file(s) as a single string. :rtype: str """ if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def words(self, fileids=None): """ :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return concat([ChunkedCorpusView(f, enc, 0, 0, 0, 0, *self._cv_args) for (f, enc) in self.abspaths(fileids, True)]) def sents(self, fileids=None): """ :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings. :rtype: list(list(str)) """ return concat([ChunkedCorpusView(f, enc, 0, 1, 0, 0, *self._cv_args) for (f, enc) in self.abspaths(fileids, True)]) def paras(self, fileids=None): """ :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as lists of word strings. :rtype: list(list(list(str))) """ return concat([ChunkedCorpusView(f, enc, 0, 1, 1, 0, *self._cv_args) for (f, enc) in self.abspaths(fileids, True)]) def tagged_words(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of tagged words and punctuation symbols, encoded as tuples ``(word,tag)``. :rtype: list(tuple(str,str)) """ return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 0, *self._cv_args, target_tagset=tagset) for (f, enc) in self.abspaths(fileids, True)]) def tagged_sents(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of sentences, each encoded as a list of ``(word,tag)`` tuples. :rtype: list(list(tuple(str,str))) """ return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 0, *self._cv_args, target_tagset=tagset) for (f, enc) in self.abspaths(fileids, True)]) def tagged_paras(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as lists of ``(word,tag)`` tuples. :rtype: list(list(list(tuple(str,str)))) """ return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 0, *self._cv_args, target_tagset=tagset) for (f, enc) in self.abspaths(fileids, True)]) def chunked_words(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of tagged words and chunks. Words are encoded as ``(word, tag)`` tuples (if the corpus has tags) or word strings (if the corpus has no tags). Chunks are encoded as depth-one trees over ``(word,tag)`` tuples or word strings. :rtype: list(tuple(str,str) and Tree) """ return concat([ChunkedCorpusView(f, enc, 1, 0, 0, 1, *self._cv_args, target_tagset=tagset) for (f, enc) in self.abspaths(fileids, True)]) def chunked_sents(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of sentences, each encoded as a shallow Tree. The leaves of these trees are encoded as ``(word, tag)`` tuples (if the corpus has tags) or word strings (if the corpus has no tags). 
:rtype: list(Tree) """ return concat([ChunkedCorpusView(f, enc, 1, 1, 0, 1, *self._cv_args, target_tagset=tagset) for (f, enc) in self.abspaths(fileids, True)]) def chunked_paras(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as a shallow Tree. The leaves of these trees are encoded as ``(word, tag)`` tuples (if the corpus has tags) or word strings (if the corpus has no tags). :rtype: list(list(Tree)) """ return concat([ChunkedCorpusView(f, enc, 1, 1, 1, 1, *self._cv_args, target_tagset=tagset) for (f, enc) in self.abspaths(fileids, True)]) def _read_block(self, stream): return [tagstr2tree(t) for t in read_blankline_block(stream)] class ChunkedCorpusView(StreamBackedCorpusView): def __init__(self, fileid, encoding, tagged, group_by_sent, group_by_para, chunked, str2chunktree, sent_tokenizer, para_block_reader, source_tagset=None, target_tagset=None): StreamBackedCorpusView.__init__(self, fileid, encoding=encoding) self._tagged = tagged self._group_by_sent = group_by_sent self._group_by_para = group_by_para self._chunked = chunked self._str2chunktree = str2chunktree self._sent_tokenizer = sent_tokenizer self._para_block_reader = para_block_reader self._source_tagset = source_tagset self._target_tagset = target_tagset def read_block(self, stream): block = [] for para_str in self._para_block_reader(stream): para = [] for sent_str in self._sent_tokenizer.tokenize(para_str): sent = self._str2chunktree(sent_str, source_tagset=self._source_tagset, target_tagset=self._target_tagset) # If requested, throw away the tags. if not self._tagged: sent = self._untag(sent) # If requested, throw away the chunks. if not self._chunked: sent = sent.leaves() # Add the sentence to `para`. if self._group_by_sent: para.append(sent) else: para.extend(sent) # Add the paragraph to `block`. if self._group_by_para: block.append(para) else: block.extend(para) # Return the block return block def _untag(self, tree): for i, child in enumerate(tree): if isinstance(child, Tree): self._untag(child) elif isinstance(child, tuple): tree[i] = child[0] else: raise ValueError('expected child to be Tree or tuple') return tree nltk-3.1/nltk/corpus/reader/cmudict.py0000644000076500000240000000676112607224144017617 0ustar sbstaff00000000000000# Natural Language Toolkit: Carnegie Mellon Pronouncing Dictionary Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT """ The Carnegie Mellon Pronouncing Dictionary [cmudict.0.6] ftp://ftp.cs.cmu.edu/project/speech/dict/ Copyright 1998 Carnegie Mellon University File Format: Each line consists of an uppercased word, a counter (for alternative pronunciations), and a transcription. Vowels are marked for stress (1=primary, 2=secondary, 0=no stress). E.g.: NATURAL 1 N AE1 CH ER0 AH0 L The dictionary contains 127069 entries. Of these, 119400 words are assigned a unique pronunciation, 6830 words have two pronunciations, and 839 words have three or more pronunciations. Many of these are fast-speech variants. 
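For example, an entry line like the one shown above maps onto the
(word, transcriptions) pair yielded by ``entries()``, essentially as done by
``read_cmudict_block()`` below (an illustrative sketch only):

    line = 'NATURAL 1 N AE1 CH ER0 AH0 L'
    pieces = line.split()
    entry = (pieces[0].lower(), pieces[2:])
    # entry == ('natural', ['N', 'AE1', 'CH', 'ER0', 'AH0', 'L'])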
Phonemes: There are 39 phonemes, as shown below: Phoneme Example Translation Phoneme Example Translation ------- ------- ----------- ------- ------- ----------- AA odd AA D AE at AE T AH hut HH AH T AO ought AO T AW cow K AW AY hide HH AY D B be B IY CH cheese CH IY Z D dee D IY DH thee DH IY EH Ed EH D ER hurt HH ER T EY ate EY T F fee F IY G green G R IY N HH he HH IY IH it IH T IY eat IY T JH gee JH IY K key K IY L lee L IY M me M IY N knee N IY NG ping P IH NG OW oat OW T OY toy T OY P pee P IY R read R IY D S sea S IY SH she SH IY T tea T IY TH theta TH EY T AH UH hood HH UH D UW two T UW V vee V IY W we W IY Y yield Y IY L D Z zee Z IY ZH seizure S IY ZH ER """ import codecs from nltk import compat from nltk.util import Index from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class CMUDictCorpusReader(CorpusReader): def entries(self): """ :return: the cmudict lexicon as a list of entries containing (word, transcriptions) tuples. """ return concat([StreamBackedCorpusView(fileid, read_cmudict_block, encoding=enc) for fileid, enc in self.abspaths(None, True)]) def raw(self): """ :return: the cmudict lexicon as a raw string. """ fileids = self._fileids if isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def words(self): """ :return: a list of all words defined in the cmudict lexicon. """ return [word.lower() for (word, _) in self.entries()] def dict(self): """ :return: the cmudict lexicon as a dictionary, whose keys are lowercase words and whose values are lists of pronunciations. """ return dict(Index(self.entries())) def read_cmudict_block(stream): entries = [] while len(entries) < 100: # Read 100 at a time. line = stream.readline() if line == '': return entries # end of file. pieces = line.split() entries.append( (pieces[0].lower(), pieces[2:]) ) return entries nltk-3.1/nltk/corpus/reader/comparative_sents.py0000644000076500000240000002677512607224144021724 0ustar sbstaff00000000000000# Natural Language Toolkit: Comparative Sentence Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Pierpaolo Pantone <24alsecondo@gmail.com> # URL: # For license information, see LICENSE.TXT """ CorpusReader for the Comparative Sentence Dataset. - Comparative Sentence Dataset information - Annotated by: Nitin Jindal and Bing Liu, 2006. Department of Computer Sicence University of Illinois at Chicago Contact: Nitin Jindal, njindal@cs.uic.edu Bing Liu, liub@cs.uic.edu (http://www.cs.uic.edu/~liub) Distributed with permission. Related papers: - Nitin Jindal and Bing Liu. "Identifying Comparative Sentences in Text Documents". Proceedings of the ACM SIGIR International Conference on Information Retrieval (SIGIR-06), 2006. - Nitin Jindal and Bing Liu. "Mining Comprative Sentences and Relations". Proceedings of Twenty First National Conference on Artificial Intelligence (AAAI-2006), 2006. - Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences". Proceedings of the 22nd International Conference on Computational Linguistics (Coling-2008), Manchester, 18-22 August, 2008. 
""" import re from nltk.corpus.reader.api import * from nltk.tokenize import * # Regular expressions for dataset components STARS = re.compile(r'^\*+$') COMPARISON = re.compile(r'') CLOSE_COMPARISON = re.compile(r'') GRAD_COMPARISON = re.compile(r'') NON_GRAD_COMPARISON = re.compile(r'') ENTITIES_FEATS = re.compile(r"(\d)_((?:[\.\w\s/-](?!\d_))+)") KEYWORD = re.compile(r'\((?!.*\()(.*)\)$') class Comparison(object): """ A Comparison represents a comparative sentence and its constituents. """ def __init__(self, text=None, comp_type=None, entity_1=None, entity_2=None, feature=None, keyword=None): """ :param text: a string (optionally tokenized) containing a comparation. :param comp_type: an integer defining the type of comparison expressed. Values can be: 1 (Non-equal gradable), 2 (Equative), 3 (Superlative), 4 (Non-gradable). :param entity_1: the first entity considered in the comparison relation. :param entity_2: the second entity considered in the comparison relation. :param feature: the feature considered in the comparison relation. :param keyword: the word or phrase which is used for that comparative relation. """ self.text = text self.comp_type = comp_type self.entity_1 = entity_1 self.entity_2 = entity_2 self.feature = feature self.keyword = keyword def __repr__(self): return ("Comparison(text=\"{}\", comp_type={}, entity_1=\"{}\", entity_2=\"{}\", " "feature=\"{}\", keyword=\"{}\")").format(self.text, self.comp_type, self.entity_1, self.entity_2, self.feature, self.keyword) class ComparativeSentencesCorpusReader(CorpusReader): """ Reader for the Comparative Sentence Dataset by Jindal and Liu (2006). >>> from nltk.corpus import comparative_sentences >>> comparison = comparative_sentences.comparisons()[0] >>> comparison.text ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly', 'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve", 'had', '.'] >>> comparison.entity_2 'models' >>> (comparison.feature, comparison.keyword) ('rewind', 'more') >>> len(comparative_sentences.comparisons()) 853 """ CorpusView = StreamBackedCorpusView def __init__(self, root, fileids, word_tokenizer=WhitespaceTokenizer(), sent_tokenizer=None, encoding='utf8'): """ :param root: The root directory for this corpus. :param fileids: a list or regexp specifying the fileids in this corpus. :param word_tokenizer: tokenizer for breaking sentences or paragraphs into words. Default: `WhitespaceTokenizer` :param sent_tokenizer: tokenizer for breaking paragraphs into sentences. :param encoding: the encoding that should be used to read the corpus. """ CorpusReader.__init__(self, root, fileids, encoding) self._word_tokenizer = word_tokenizer self._sent_tokenizer = sent_tokenizer def comparisons(self, fileids=None): """ Return all comparisons in the corpus. :param fileids: a list or regexp specifying the ids of the files whose comparisons have to be returned. :return: the given file(s) as a list of Comparison objects. :rtype: list(Comparison) """ if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.CorpusView(path, self._read_comparison_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True)]) def keywords(self, fileids=None): """ Return a set of all keywords used in the corpus. :param fileids: a list or regexp specifying the ids of the files whose keywords have to be returned. :return: the set of keywords and comparative phrases used in the corpus. 
:rtype: set(str) """ all_keywords = concat([self.CorpusView(path, self._read_keyword_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True)]) keywords_set = set([keyword.lower() for keyword in all_keywords if keyword]) return keywords_set def keywords_readme(self): """ Return the list of words and constituents considered as clues of a comparison (from listOfkeywords.txt). """ keywords = [] raw_text = self.open("listOfkeywords.txt").read() for line in raw_text.split("\n"): if not line or line.startswith("//"): continue keywords.append(line.strip()) return keywords def raw(self, fileids=None): """ :param fileids: a list or regexp specifying the fileids that have to be returned as a raw string. :return: the given file(s) as a single string. :rtype: str """ if fileids is None: fileids = self._fileids elif isinstance(fileids, string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def readme(self): """ Return the contents of the corpus readme file. """ return self.open("README.txt").read() def sents(self, fileids=None): """ Return all sentences in the corpus. :param fileids: a list or regexp specifying the ids of the files whose sentences have to be returned. :return: all sentences of the corpus as lists of tokens (or as plain strings, if no word tokenizer is specified). :rtype: list(list(str)) or list(str) """ return concat([self.CorpusView(path, self._read_sent_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True)]) def words(self, fileids=None): """ Return all words and punctuation symbols in the corpus. :param fileids: a list or regexp specifying the ids of the files whose words have to be returned. :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return concat([self.CorpusView(path, self._read_word_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True)]) def _read_comparison_block(self, stream): while True: line = stream.readline() if not line: return [] # end of file. comparison_tags = re.findall(COMPARISON, line) if comparison_tags: grad_comparisons = re.findall(GRAD_COMPARISON, line) non_grad_comparisons = re.findall(NON_GRAD_COMPARISON, line) # Advance to the next line (it contains the comparative sentence) comparison_text = stream.readline().strip() if self._word_tokenizer: comparison_text = self._word_tokenizer.tokenize(comparison_text) # Skip the next line (it contains closing comparison tags) stream.readline() # If gradable comparisons are found, create Comparison instances # and populate their fields comparison_bundle = [] if grad_comparisons: # Each comparison tag has its own relations on a separate line for comp in grad_comparisons: comp_type = int(re.match(r'', comp).group(1)) comparison = Comparison(text=comparison_text, comp_type=comp_type) line = stream.readline() entities_feats = ENTITIES_FEATS.findall(line) if entities_feats: for (code, entity_feat) in entities_feats: if code == '1': comparison.entity_1 = entity_feat.strip() elif code == '2': comparison.entity_2 = entity_feat.strip() elif code == '3': comparison.feature = entity_feat.strip() keyword = KEYWORD.findall(line) if keyword: comparison.keyword = keyword[0] comparison_bundle.append(comparison) # If non-gradable comparisons are found, create a simple Comparison # instance for each one if non_grad_comparisons: for comp in non_grad_comparisons: # comp_type in this case should always be 4. 
comp_type = int(re.match(r'', comp).group(1)) comparison = Comparison(text=comparison_text, comp_type=comp_type) comparison_bundle.append(comparison) # Flatten the list of comparisons before returning them # return concat([comparison_bundle]) return comparison_bundle def _read_keyword_block(self, stream): keywords = [] for comparison in self._read_comparison_block(stream): keywords.append(comparison.keyword) return keywords def _read_sent_block(self, stream): while True: line = stream.readline() if re.match(STARS, line): while True: line = stream.readline() if re.match(STARS, line): break continue if not re.findall(COMPARISON, line) and not ENTITIES_FEATS.findall(line) \ and not re.findall(CLOSE_COMPARISON, line): if self._sent_tokenizer: return [self._word_tokenizer.tokenize(sent) for sent in self._sent_tokenizer.tokenize(line)] else: return [self._word_tokenizer.tokenize(line)] def _read_word_block(self, stream): words = [] for sent in self._read_sent_block(stream): words.extend(sent) return words nltk-3.1/nltk/corpus/reader/conll.py0000644000076500000240000005207012607224144017270 0ustar sbstaff00000000000000# Natural Language Toolkit: CONLL Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ Read CoNLL-style chunk fileids. """ from __future__ import unicode_literals import os import codecs import textwrap from nltk import compat from nltk.tree import Tree from nltk.util import LazyMap, LazyConcatenation from nltk.tag import map_tag from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class ConllCorpusReader(CorpusReader): """ A corpus reader for CoNLL-style files. These files consist of a series of sentences, separated by blank lines. Each sentence is encoded using a table (or "grid") of values, where each line corresponds to a single word, and each column corresponds to an annotation type. The set of columns used by CoNLL-style files can vary from corpus to corpus; the ``ConllCorpusReader`` constructor therefore takes an argument, ``columntypes``, which is used to specify the columns that are used by a given corpus. @todo: Add support for reading from corpora where different parallel files contain different columns. @todo: Possibly add caching of the grid corpus view? This would allow the same grid view to be used by different data access methods (eg words() and parsed_sents() could both share the same grid corpus view object). @todo: Better support for -DOCSTART-. Currently, we just ignore it, but it could be used to define methods that retrieve a document at a time (eg parsed_documents()). """ #///////////////////////////////////////////////////////////////// # Column Types #///////////////////////////////////////////////////////////////// WORDS = 'words' #: column type for words POS = 'pos' #: column type for part-of-speech tags TREE = 'tree' #: column type for parse trees CHUNK = 'chunk' #: column type for chunk structures NE = 'ne' #: column type for named entities SRL = 'srl' #: column type for semantic role labels IGNORE = 'ignore' #: column type for column that should be ignored #: A list of all column types supported by the conll corpus reader. 
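    #: For instance, a corpus whose rows look like ``Confidence NN B-NP`` (one
    #: token per line, in the CoNLL-2000 chunking style) would be read with
    #: ``columntypes=('words', 'pos', 'chunk')``.  (Illustrative note added here;
    #: the sample row is not taken from this distribution.)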
COLUMN_TYPES = (WORDS, POS, TREE, CHUNK, NE, SRL, IGNORE) #///////////////////////////////////////////////////////////////// # Constructor #///////////////////////////////////////////////////////////////// def __init__(self, root, fileids, columntypes, chunk_types=None, root_label='S', pos_in_tree=False, srl_includes_roleset=True, encoding='utf8', tree_class=Tree, tagset=None): for columntype in columntypes: if columntype not in self.COLUMN_TYPES: raise ValueError('Bad column type %r' % columntype) if isinstance(chunk_types, compat.string_types): chunk_types = [chunk_types] self._chunk_types = chunk_types self._colmap = dict((c,i) for (i,c) in enumerate(columntypes)) self._pos_in_tree = pos_in_tree self._root_label = root_label # for chunks self._srl_includes_roleset = srl_includes_roleset self._tree_class = tree_class CorpusReader.__init__(self, root, fileids, encoding) self._tagset = tagset #///////////////////////////////////////////////////////////////// # Data Access Methods #///////////////////////////////////////////////////////////////// def raw(self, fileids=None): if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def words(self, fileids=None): self._require(self.WORDS) return LazyConcatenation(LazyMap(self._get_words, self._grids(fileids))) def sents(self, fileids=None): self._require(self.WORDS) return LazyMap(self._get_words, self._grids(fileids)) def tagged_words(self, fileids=None, tagset=None): self._require(self.WORDS, self.POS) def get_tagged_words(grid): return self._get_tagged_words(grid, tagset) return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids))) def tagged_sents(self, fileids=None, tagset=None): self._require(self.WORDS, self.POS) def get_tagged_words(grid): return self._get_tagged_words(grid, tagset) return LazyMap(get_tagged_words, self._grids(fileids)) def chunked_words(self, fileids=None, chunk_types=None, tagset=None): self._require(self.WORDS, self.POS, self.CHUNK) if chunk_types is None: chunk_types = self._chunk_types def get_chunked_words(grid): # capture chunk_types as local var return self._get_chunked_words(grid, chunk_types, tagset) return LazyConcatenation(LazyMap(get_chunked_words, self._grids(fileids))) def chunked_sents(self, fileids=None, chunk_types=None, tagset=None): self._require(self.WORDS, self.POS, self.CHUNK) if chunk_types is None: chunk_types = self._chunk_types def get_chunked_words(grid): # capture chunk_types as local var return self._get_chunked_words(grid, chunk_types, tagset) return LazyMap(get_chunked_words, self._grids(fileids)) def parsed_sents(self, fileids=None, pos_in_tree=None, tagset=None): self._require(self.WORDS, self.POS, self.TREE) if pos_in_tree is None: pos_in_tree = self._pos_in_tree def get_parsed_sent(grid): # capture pos_in_tree as local var return self._get_parsed_sent(grid, pos_in_tree, tagset) return LazyMap(get_parsed_sent, self._grids(fileids)) def srl_spans(self, fileids=None): self._require(self.SRL) return LazyMap(self._get_srl_spans, self._grids(fileids)) def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True): self._require(self.WORDS, self.POS, self.TREE, self.SRL) if pos_in_tree is None: pos_in_tree = self._pos_in_tree def get_srl_instances(grid): # capture pos_in_tree as local var return self._get_srl_instances(grid, pos_in_tree) result = LazyMap(get_srl_instances, self._grids(fileids)) if flatten: result = LazyConcatenation(result) return result def 
iob_words(self, fileids=None, tagset=None): """ :return: a list of word/tag/IOB tuples :rtype: list(tuple) :param fileids: the list of fileids that make up this corpus :type fileids: None or str or list """ self._require(self.WORDS, self.POS, self.CHUNK) def get_iob_words(grid): return self._get_iob_words(grid, tagset) return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids))) def iob_sents(self, fileids=None, tagset=None): """ :return: a list of lists of word/tag/IOB tuples :rtype: list(list) :param fileids: the list of fileids that make up this corpus :type fileids: None or str or list """ self._require(self.WORDS, self.POS, self.CHUNK) def get_iob_words(grid): return self._get_iob_words(grid, tagset) return LazyMap(get_iob_words, self._grids(fileids)) #///////////////////////////////////////////////////////////////// # Grid Reading #///////////////////////////////////////////////////////////////// def _grids(self, fileids=None): # n.b.: we could cache the object returned here (keyed on # fileids), which would let us reuse the same corpus view for # different things (eg srl and parse trees). return concat([StreamBackedCorpusView(fileid, self._read_grid_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True)]) def _read_grid_block(self, stream): grids = [] for block in read_blankline_block(stream): block = block.strip() if not block: continue grid = [line.split() for line in block.split('\n')] # If there's a docstart row, then discard. ([xx] eventually it # would be good to actually use it) if grid[0][self._colmap.get('words', 0)] == '-DOCSTART-': del grid[0] # Check that the grid is consistent. for row in grid: if len(row) != len(grid[0]): raise ValueError('Inconsistent number of columns:\n%s' % block) grids.append(grid) return grids #///////////////////////////////////////////////////////////////// # Transforms #///////////////////////////////////////////////////////////////// # given a grid, transform it into some representation (e.g., # a list of words or a parse tree). def _get_words(self, grid): return self._get_column(grid, self._colmap['words']) def _get_tagged_words(self, grid, tagset=None): pos_tags = self._get_column(grid, self._colmap['pos']) if tagset and tagset != self._tagset: pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] return list(zip(self._get_column(grid, self._colmap['words']), pos_tags)) def _get_iob_words(self, grid, tagset=None): pos_tags = self._get_column(grid, self._colmap['pos']) if tagset and tagset != self._tagset: pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] return list(zip(self._get_column(grid, self._colmap['words']), pos_tags, self._get_column(grid, self._colmap['chunk']))) def _get_chunked_words(self, grid, chunk_types, tagset=None): # n.b.: this method is very similar to conllstr2tree. words = self._get_column(grid, self._colmap['words']) pos_tags = self._get_column(grid, self._colmap['pos']) if tagset and tagset != self._tagset: pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] chunk_tags = self._get_column(grid, self._colmap['chunk']) stack = [Tree(self._root_label, [])] for (word, pos_tag, chunk_tag) in zip(words, pos_tags, chunk_tags): if chunk_tag == 'O': state, chunk_type = 'O', '' else: (state, chunk_type) = chunk_tag.split('-') # If it's a chunk we don't care about, treat it as O. if chunk_types is not None and chunk_type not in chunk_types: state = 'O' # Treat a mismatching I like a B. 
if state == 'I' and chunk_type != stack[-1].label(): state = 'B' # For B or I: close any open chunks if state in 'BO' and len(stack) == 2: stack.pop() # For B: start a new chunk. if state == 'B': new_chunk = Tree(chunk_type, []) stack[-1].append(new_chunk) stack.append(new_chunk) # Add the word token. stack[-1].append((word, pos_tag)) return stack[0] def _get_parsed_sent(self, grid, pos_in_tree, tagset=None): words = self._get_column(grid, self._colmap['words']) pos_tags = self._get_column(grid, self._colmap['pos']) if tagset and tagset != self._tagset: pos_tags = [map_tag(self._tagset, tagset, t) for t in pos_tags] parse_tags = self._get_column(grid, self._colmap['tree']) treestr = '' for (word, pos_tag, parse_tag) in zip(words, pos_tags, parse_tags): if word == '(': word = '-LRB-' if word == ')': word = '-RRB-' if pos_tag == '(': pos_tag = '-LRB-' if pos_tag == ')': pos_tag = '-RRB-' (left, right) = parse_tag.split('*') right = right.count(')')*')' # only keep ')'. treestr += '%s (%s %s) %s' % (left, pos_tag, word, right) try: tree = self._tree_class.parse(treestr) except (ValueError, IndexError): tree = self._tree_class.parse('(%s %s)' % (self._root_label, treestr)) if not pos_in_tree: for subtree in tree.subtrees(): for i, child in enumerate(subtree): if (isinstance(child, Tree) and len(child)==1 and isinstance(child[0], compat.string_types)): subtree[i] = (child[0], child.label()) return tree def _get_srl_spans(self, grid): """ list of list of (start, end), tag) tuples """ if self._srl_includes_roleset: predicates = self._get_column(grid, self._colmap['srl']+1) start_col = self._colmap['srl']+2 else: predicates = self._get_column(grid, self._colmap['srl']) start_col = self._colmap['srl']+1 # Count how many predicates there are. This tells us how many # columns to expect for SRL data. num_preds = len([p for p in predicates if p != '-']) spanlists = [] for i in range(num_preds): col = self._get_column(grid, start_col+i) spanlist = [] stack = [] for wordnum, srl_tag in enumerate(col): (left, right) = srl_tag.split('*') for tag in left.split('('): if tag: stack.append((tag, wordnum)) for i in range(right.count(')')): (tag, start) = stack.pop() spanlist.append( ((start, wordnum+1), tag) ) spanlists.append(spanlist) return spanlists def _get_srl_instances(self, grid, pos_in_tree): tree = self._get_parsed_sent(grid, pos_in_tree) spanlists = self._get_srl_spans(grid) if self._srl_includes_roleset: predicates = self._get_column(grid, self._colmap['srl']+1) rolesets = self._get_column(grid, self._colmap['srl']) else: predicates = self._get_column(grid, self._colmap['srl']) rolesets = [None] * len(predicates) instances = ConllSRLInstanceList(tree) for wordnum, predicate in enumerate(predicates): if predicate == '-': continue # Decide which spanlist to use. Don't assume that they're # sorted in the same order as the predicates (even though # they usually are). for spanlist in spanlists: for (start, end), tag in spanlist: if wordnum in range(start,end) and tag in ('V', 'C-V'): break else: continue break else: raise ValueError('No srl column found for %r' % predicate) instances.append(ConllSRLInstance(tree, wordnum, predicate, rolesets[wordnum], spanlist)) return instances #///////////////////////////////////////////////////////////////// # Helper Methods #///////////////////////////////////////////////////////////////// def _require(self, *columntypes): for columntype in columntypes: if columntype not in self._colmap: raise ValueError('This corpus does not contain a %s ' 'column.' 
% columntype) @staticmethod def _get_column(grid, column_index): return [grid[i][column_index] for i in range(len(grid))] @compat.python_2_unicode_compatible class ConllSRLInstance(object): """ An SRL instance from a CoNLL corpus, which identifies and providing labels for the arguments of a single verb. """ # [xx] add inst.core_arguments, inst.argm_arguments? def __init__(self, tree, verb_head, verb_stem, roleset, tagged_spans): self.verb = [] """A list of the word indices of the words that compose the verb whose arguments are identified by this instance. This will contain multiple word indices when multi-word verbs are used (e.g. 'turn on').""" self.verb_head = verb_head """The word index of the head word of the verb whose arguments are identified by this instance. E.g., for a sentence that uses the verb 'turn on,' ``verb_head`` will be the word index of the word 'turn'.""" self.verb_stem = verb_stem self.roleset = roleset self.arguments = [] """A list of ``(argspan, argid)`` tuples, specifying the location and type for each of the arguments identified by this instance. ``argspan`` is a tuple ``start, end``, indicating that the argument consists of the ``words[start:end]``.""" self.tagged_spans = tagged_spans """A list of ``(span, id)`` tuples, specifying the location and type for each of the arguments, as well as the verb pieces, that make up this instance.""" self.tree = tree """The parse tree for the sentence containing this instance.""" self.words = tree.leaves() """A list of the words in the sentence containing this instance.""" # Fill in the self.verb and self.arguments values. for (start, end), tag in tagged_spans: if tag in ('V', 'C-V'): self.verb += list(range(start, end)) else: self.arguments.append( ((start, end), tag) ) def __repr__(self): plural = len(self.arguments)!=1 and 's' or '' return '' % ( (self.verb_stem, len(self.arguments), plural)) def pprint(self): verbstr = ' '.join(self.words[i][0] for i in self.verb) hdr = 'SRL for %r (stem=%r):\n' % (verbstr, self.verb_stem) s = '' for i, word in enumerate(self.words): if isinstance(word, tuple): word = word[0] for (start, end), argid in self.arguments: if i == start: s += '[%s ' % argid if i == end: s += '] ' if i in self.verb: word = '<<%s>>' % word s += word + ' ' return hdr + textwrap.fill(s.replace(' ]', ']'), initial_indent=' ', subsequent_indent=' ') @compat.python_2_unicode_compatible class ConllSRLInstanceList(list): """ Set of instances for a single sentence """ def __init__(self, tree, instances=()): self.tree = tree list.__init__(self, instances) def __str__(self): return self.pprint() def pprint(self, include_tree=False): # Sanity check: trees should be the same for inst in self: if inst.tree != self.tree: raise ValueError('Tree mismatch!') # If desired, add trees: if include_tree: words = self.tree.leaves() pos = [None] * len(words) synt = ['*'] * len(words) self._tree2conll(self.tree, 0, words, pos, synt) s = '' for i in range(len(words)): # optional tree columns if include_tree: s += '%-20s ' % words[i] s += '%-8s ' % pos[i] s += '%15s*%-8s ' % tuple(synt[i].split('*')) # verb head column for inst in self: if i == inst.verb_head: s += '%-20s ' % inst.verb_stem break else: s += '%-20s ' % '-' # Remaining columns: self for inst in self: argstr = '*' for (start, end), argid in inst.tagged_spans: if i==start: argstr = '(%s%s' % (argid, argstr) if i==(end-1): argstr += ')' s += '%-12s ' % argstr s += '\n' return s def _tree2conll(self, tree, wordnum, words, pos, synt): assert isinstance(tree, Tree) if len(tree) == 
1 and isinstance(tree[0], compat.string_types): pos[wordnum] = tree.label() assert words[wordnum] == tree[0] return wordnum+1 elif len(tree) == 1 and isinstance(tree[0], tuple): assert len(tree[0]) == 2 pos[wordnum], pos[wordnum] = tree[0] return wordnum+1 else: synt[wordnum] = '(%s%s' % (tree.label(), synt[wordnum]) for child in tree: wordnum = self._tree2conll(child, wordnum, words, pos, synt) synt[wordnum-1] += ')' return wordnum class ConllChunkCorpusReader(ConllCorpusReader): """ A ConllCorpusReader whose data file contains three columns: words, pos, and chunk. """ def __init__(self, root, fileids, chunk_types, encoding='utf8', tagset=None): ConllCorpusReader.__init__( self, root, fileids, ('words', 'pos', 'chunk'), chunk_types=chunk_types, encoding=encoding, tagset=tagset) nltk-3.1/nltk/corpus/reader/crubadan.py0000644000076500000240000000755212607224144017745 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: An Crubadan N-grams Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Avital Pekker # # URL: # For license information, see LICENSE.TXT """ An NLTK interface for the n-gram statistics gathered from the corpora for each language using An Crubadan. There are multiple potential applications for the data but this reader was created with the goal of using it in the context of language identification. For details about An Crubadan, this data, and its potential uses, see: http://borel.slu.edu/crubadan/index.html """ from __future__ import print_function, unicode_literals import re from nltk.compat import PY3 from os import path from nltk.corpus.reader import CorpusReader from nltk.probability import FreqDist from nltk.data import ZipFilePathPointer class CrubadanCorpusReader(CorpusReader): """ A corpus reader used to access language An Crubadan n-gram files. 
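    A minimal usage sketch (assumes the 'crubadan' data package has already been
    downloaded and unpacked under the NLTK data directory; the path, fileid
    pattern and language code below are illustrative assumptions):

        from nltk.corpus.reader.crubadan import CrubadanCorpusReader
        from nltk.data import find
        crubadan = CrubadanCorpusReader(find('corpora/crubadan'), r'.*\.txt')
        english_trigrams = crubadan.lang_freq('eng')  # FreqDist over character 3-grams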
""" _LANG_MAPPER_FILE = 'table.txt' _all_lang_freq = {} def __init__(self, root, fileids, encoding='utf8', tagset=None): super(CrubadanCorpusReader, self).__init__(root, fileids, encoding='utf8') self._lang_mapping_data = [] self._load_lang_mapping_data() def lang_freq(self, lang): ''' Return n-gram FreqDist for a specific language given ISO 639-3 language code ''' if lang not in self._all_lang_freq: self._all_lang_freq[lang] = self._load_lang_ngrams(lang) return self._all_lang_freq[lang] def langs(self): ''' Return a list of supported languages as ISO 639-3 codes ''' return [row[1] for row in self._lang_mapping_data] def iso_to_crubadan(self, lang): ''' Return internal Crubadan code based on ISO 639-3 code ''' for i in self._lang_mapping_data: if i[1].lower() == lang.lower(): return i[0] def crubadan_to_iso(self, lang): ''' Return ISO 639-3 code given internal Crubadan code ''' for i in self._lang_mapping_data: if i[0].lower() == lang.lower(): return i[1] def _load_lang_mapping_data(self): ''' Load language mappings between codes and description from table.txt ''' if isinstance(self.root, ZipFilePathPointer): raise RuntimeError("Please install the 'crubadan' corpus first, use nltk.download()") mapper_file = path.join(self.root, self._LANG_MAPPER_FILE) if self._LANG_MAPPER_FILE not in self.fileids(): raise RuntimeError("Could not find language mapper file: " + mapper_file) if PY3: raw = open(mapper_file, 'r', encoding='utf-8').read().strip() else: raw = open(mapper_file, 'rU').read().decode('utf-8').strip() self._lang_mapping_data = [row.split('\t') for row in raw.split('\n')] def _load_lang_ngrams(self, lang): ''' Load single n-gram language file given the ISO 639-3 language code and return its FreqDist ''' if lang not in self.langs(): raise RuntimeError("Unsupported language.") crubadan_code = self.iso_to_crubadan(lang) ngram_file = path.join(self.root, crubadan_code + '-3grams.txt') if not path.isfile(ngram_file): raise Runtime("No N-gram file found for requested language.") counts = FreqDist() if PY3: f = open(ngram_file, 'r', encoding='utf-8') else: f = open(ngram_file, 'rU') for line in f: if PY3: data = line.split(' ') else: data = line.decode('utf8').split(' ') ngram = data[1].strip('\n') freq = int(data[0]) counts[ngram] = freq return counts nltk-3.1/nltk/corpus/reader/dependency.py0000644000076500000240000000764512607224144020307 0ustar sbstaff00000000000000# Natural Language Toolkit: Dependency Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Kepa Sarasola # Iker Manterola # # URL: # For license information, see LICENSE.TXT import codecs from nltk.parse import DependencyGraph from nltk.tokenize import * from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class DependencyCorpusReader(SyntaxCorpusReader): def __init__(self, root, fileids, encoding='utf8', word_tokenizer=TabTokenizer(), sent_tokenizer=RegexpTokenizer('\n', gaps=True), para_block_reader=read_blankline_block): CorpusReader.__init__(self, root, fileids, encoding) ######################################################### def raw(self, fileids=None): """ :return: the given file(s) as a single string. 
:rtype: str """ result = [] for fileid, encoding in self.abspaths(fileids, include_encoding=True): if isinstance(fileid, PathPointer): result.append(fileid.open(encoding=encoding).read()) else: with codecs.open(fileid, "r", encoding) as fp: result.append(fp.read()) return concat(result) def words(self, fileids=None): return concat([DependencyCorpusView(fileid, False, False, False, encoding=enc) for fileid, enc in self.abspaths(fileids, include_encoding=True)]) def tagged_words(self, fileids=None): return concat([DependencyCorpusView(fileid, True, False, False, encoding=enc) for fileid, enc in self.abspaths(fileids, include_encoding=True)]) def sents(self, fileids=None): return concat([DependencyCorpusView(fileid, False, True, False, encoding=enc) for fileid, enc in self.abspaths(fileids, include_encoding=True)]) def tagged_sents(self, fileids=None): return concat([DependencyCorpusView(fileid, True, True, False, encoding=enc) for fileid, enc in self.abspaths(fileids, include_encoding=True)]) def parsed_sents(self, fileids=None): sents=concat([DependencyCorpusView(fileid, False, True, True, encoding=enc) for fileid, enc in self.abspaths(fileids, include_encoding=True)]) return [DependencyGraph(sent) for sent in sents] class DependencyCorpusView(StreamBackedCorpusView): _DOCSTART = '-DOCSTART- -DOCSTART- O\n' #dokumentu hasiera definitzen da def __init__(self, corpus_file, tagged, group_by_sent, dependencies, chunk_types=None, encoding='utf8'): self._tagged = tagged self._dependencies = dependencies self._group_by_sent = group_by_sent self._chunk_types = chunk_types StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) def read_block(self, stream): # Read the next sentence. sent = read_blankline_block(stream)[0].strip() # Strip off the docstart marker, if present. if sent.startswith(self._DOCSTART): sent = sent[len(self._DOCSTART):].lstrip() # extract word and tag from any of the formats if not self._dependencies: lines = [line.split('\t') for line in sent.split('\n')] if len(lines[0]) == 3 or len(lines[0]) == 4: sent = [(line[0], line[1]) for line in lines] elif len(lines[0]) == 10: sent = [(line[1], line[4]) for line in lines] else: raise ValueError('Unexpected number of fields in dependency tree file') # discard tags if they weren't requested if not self._tagged: sent = [word for (word, tag) in sent] # Return the result. if self._group_by_sent: return [sent] else: return list(sent) nltk-3.1/nltk/corpus/reader/framenet.py0000644000076500000240000024240712607224144017767 0ustar sbstaff00000000000000# Natural Language Toolkit: Framenet Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Authors: Chuck Wooters , # Nathan Schneider # URL: # For license information, see LICENSE.TXT from __future__ import print_function, unicode_literals """ Corpus reader for the Framenet 1.5 Corpus. """ __docformat__ = 'epytext en' import os, sys import re import textwrap from collections import defaultdict from pprint import pprint, pformat from nltk.internals import ElementWrapper from nltk.corpus.reader import XMLCorpusReader, XMLCorpusView from nltk.compat import text_type, string_types, python_2_unicode_compatible from nltk.util import AbstractLazySequence, LazyMap def _pretty_longstring(defstr, prefix='', wrap_at=65): """ Helper function for pretty-printing a long string. :param defstr: The string to be printed. :type defstr: str :return: A nicely formated string representation of the long string. 
:rtype: str """ outstr = "" for line in textwrap.fill(defstr, wrap_at).split('\n'): outstr += prefix + line + '\n' return outstr def _pretty_any(obj): """ Helper function for pretty-printing any AttrDict object. :param obj: The obj to be printed. :type obj: AttrDict :return: A nicely formated string representation of the AttrDict object. :rtype: str """ outstr = "" for k in obj: if isinstance(obj[k], string_types) and len(obj[k]) > 65: outstr += "[{0}]\n".format(k) outstr += "{0}".format(_pretty_longstring(obj[k], prefix=' ')) outstr += '\n' else: outstr += "[{0}] {1}\n".format(k, obj[k]) return outstr def _pretty_semtype(st): """ Helper function for pretty-printing a semantic type. :param st: The semantic type to be printed. :type st: AttrDict :return: A nicely formated string representation of the semantic type. :rtype: str """ semkeys = st.keys() if len(semkeys) == 1: return "" outstr = "" outstr += "semantic type ({0.ID}): {0.name}\n".format(st) if 'abbrev' in semkeys: outstr += "[abbrev] {0}\n".format(st.abbrev) if 'definition' in semkeys: outstr += "[definition]\n" outstr += _pretty_longstring(st.definition,' ') outstr += "[rootType] {0}({1})\n".format(st.rootType.name, st.rootType.ID) if st.superType is None: outstr += "[superType] \n" else: outstr += "[superType] {0}({1})\n".format(st.superType.name, st.superType.ID) outstr += "[subTypes] {0} subtypes\n".format(len(st.subTypes)) outstr += " " + ", ".join('{0}({1})'.format(x.name, x.ID) for x in st.subTypes) + '\n'*(len(st.subTypes)>0) return outstr def _pretty_frame_relation_type(freltyp): """ Helper function for pretty-printing a frame relation type. :param freltyp: The frame relation type to be printed. :type freltyp: AttrDict :return: A nicely formated string representation of the frame relation type. :rtype: str """ outstr = " {0.subFrameName}>".format(freltyp) return outstr def _pretty_frame_relation(frel): """ Helper function for pretty-printing a frame relation. :param frel: The frame relation to be printed. :type frel: AttrDict :return: A nicely formated string representation of the frame relation. :rtype: str """ outstr = "<{0.type.superFrameName}={0.superFrameName} -- {0.type.name} -> {0.type.subFrameName}={0.subFrameName}>".format(frel) return outstr def _pretty_fe_relation(ferel): """ Helper function for pretty-printing an FE relation. :param ferel: The FE relation to be printed. :type ferel: AttrDict :return: A nicely formated string representation of the FE relation. :rtype: str """ outstr = "<{0.type.superFrameName}={0.frameRelation.superFrameName}.{0.superFEName} -- {0.type.name} -> {0.type.subFrameName}={0.frameRelation.subFrameName}.{0.subFEName}>".format(ferel) return outstr def _pretty_lu(lu): """ Helper function for pretty-printing a lexical unit. :param lu: The lu to be printed. :type lu: AttrDict :return: A nicely formated string representation of the lexical unit. 
:rtype: str """ lukeys = lu.keys() outstr = "" outstr += "lexical unit ({0.ID}): {0.name}\n\n".format(lu) if 'definition' in lukeys: outstr += "[definition]\n" outstr += _pretty_longstring(lu.definition,' ') if 'frame' in lukeys: outstr += "\n[frame] {0}({1})\n".format(lu.frame.name,lu.frame.ID) if 'incorporatedFE' in lukeys: outstr += "\n[incorporatedFE] {0}\n".format(lu.incorporatedFE) if 'POS' in lukeys: outstr += "\n[POS] {0}\n".format(lu.POS) if 'status' in lukeys: outstr += "\n[status] {0}\n".format(lu.status) if 'totalAnnotated' in lukeys: outstr += "\n[totalAnnotated] {0} annotated examples\n".format(lu.totalAnnotated) if 'lexemes' in lukeys: outstr += "\n[lexemes] {0}\n".format(' '.join('{0}/{1}'.format(lex.name,lex.POS) for lex in lu.lexemes)) if 'semTypes' in lukeys: outstr += "\n[semTypes] {0} semantic types\n".format(len(lu.semTypes)) outstr += " "*(len(lu.semTypes)>0) + ", ".join('{0}({1})'.format(x.name, x.ID) for x in lu.semTypes) + '\n'*(len(lu.semTypes)>0) if 'subCorpus' in lukeys: subc = [x.name for x in lu.subCorpus] outstr += "\n[subCorpus] {0} subcorpora\n".format(len(lu.subCorpus)) for line in textwrap.fill(", ".join(sorted(subc)), 60).split('\n'): outstr += " {0}\n".format(line) return outstr def _pretty_fe(fe): """ Helper function for pretty-printing a frame element. :param fe: The frame element to be printed. :type fe: AttrDict :return: A nicely formated string representation of the frame element. :rtype: str """ fekeys = fe.keys() outstr = "" outstr += "frame element ({0.ID}): {0.name}\n of {1.name}({1.ID})\n".format(fe, fe.frame) if 'definition' in fekeys: outstr += "[definition]\n" outstr += _pretty_longstring(fe.definition,' ') if 'abbrev' in fekeys: outstr += "[abbrev] {0}\n".format(fe.abbrev) if 'coreType' in fekeys: outstr += "[coreType] {0}\n".format(fe.coreType) if 'requiresFE' in fekeys: outstr += "[requiresFE] " if fe.requiresFE is None: outstr += "\n" else: outstr += "{0}({1})\n".format(fe.requiresFE.name, fe.requiresFE.ID) if 'excludesFE' in fekeys: outstr += "[excludesFE] " if fe.excludesFE is None: outstr += "\n" else: outstr += "{0}({1})\n".format(fe.excludesFE.name, fe.excludesFE.ID) if 'semType' in fekeys: outstr += "[semType] " if fe.semType is None: outstr += "\n" else: outstr += "\n " + "{0}({1})".format(fe.semType.name, fe.semType.ID) + '\n' return outstr def _pretty_frame(frame): """ Helper function for pretty-printing a frame. :param frame: The frame to be printed. :type frame: AttrDict :return: A nicely formated string representation of the frame. 
:rtype: str """ outstr = "" outstr += "frame ({0.ID}): {0.name}\n\n".format(frame) outstr += "[definition]\n" outstr += _pretty_longstring(frame.definition, ' ') + '\n' outstr += "[semTypes] {0} semantic types\n".format(len(frame.semTypes)) outstr += " "*(len(frame.semTypes)>0) + ", ".join("{0}({1})".format(x.name, x.ID) for x in frame.semTypes) + '\n'*(len(frame.semTypes)>0) outstr += "\n[frameRelations] {0} frame relations\n".format(len(frame.frameRelations)) outstr += ' ' + '\n '.join(repr(frel) for frel in frame.frameRelations) + '\n' outstr += "\n[lexUnit] {0} lexical units\n".format(len(frame.lexUnit)) lustrs = [] for luName,lu in sorted(frame.lexUnit.items()): tmpstr = '{0} ({1})'.format(luName, lu.ID) lustrs.append(tmpstr) outstr += "{0}\n".format(_pretty_longstring(', '.join(lustrs),prefix=' ')) outstr += "\n[FE] {0} frame elements\n".format(len(frame.FE)) fes = {} for feName,fe in sorted(frame.FE.items()): try: fes[fe.coreType].append("{0} ({1})".format(feName, fe.ID)) except KeyError: fes[fe.coreType] = [] fes[fe.coreType].append("{0} ({1})".format(feName, fe.ID)) for ct in sorted(fes.keys(), key=lambda ct2: ['Core','Core-Unexpressed','Peripheral','Extra-Thematic'].index(ct2)): outstr += "{0:>16}: {1}\n".format(ct, ', '.join(sorted(fes[ct]))) outstr += "\n[FEcoreSets] {0} frame element core sets\n".format(len(frame.FEcoreSets)) outstr += " " + '\n '.join(", ".join([x.name for x in coreSet]) for coreSet in frame.FEcoreSets) + '\n' return outstr class FramenetError(Exception): """An exception class for framenet-related errors.""" @python_2_unicode_compatible class AttrDict(dict): """A class that wraps a dict and allows accessing the keys of the dict as if they were attributes. Taken from here: http://stackoverflow.com/a/14620633/8879 >>> foo = {'a':1, 'b':2, 'c':3} >>> bar = AttrDict(foo) >>> pprint(dict(bar)) {'a': 1, 'b': 2, 'c': 3} >>> bar.b 2 >>> bar.d = 4 >>> pprint(dict(bar)) {'a': 1, 'b': 2, 'c': 3, 'd': 4} """ def __init__(self, *args, **kwargs): super(AttrDict, self).__init__(*args, **kwargs) #self.__dict__ = self def __setattr__(self, name, value): self[name] = value def __getattr__(self, name): if name=='_short_repr': return self._short_repr return self[name] def __getitem__(self, name): v = super(AttrDict,self).__getitem__(name) if isinstance(v,Future): return v._data() return v def _short_repr(self): if '_type' in self: if self['_type'].endswith('relation'): return self.__repr__() try: return "<{0} ID={1} name={2}>".format(self['_type'], self['ID'], self['name']) except KeyError: # no ID--e.g., for _type=lusubcorpus return "<{0} name={1}>".format(self['_type'], self['name']) else: return self.__repr__() def _str(self): outstr = "" if not '_type' in self: outstr = _pretty_any(self) elif self['_type'] == 'frame': outstr = _pretty_frame(self) elif self['_type'] == 'fe': outstr = _pretty_fe(self) elif self['_type'] == 'lu': outstr = _pretty_lu(self) elif self['_type'] == 'semtype': outstr = _pretty_semtype(self) elif self['_type'] == 'framerelationtype': outstr = _pretty_frame_relation_type(self) elif self['_type'] == 'framerelation': outstr = _pretty_frame_relation(self) elif self['_type'] == 'ferelation': outstr = _pretty_fe_relation(self) else: outstr = _pretty_any(self) # ensure result is unicode string prior to applying the # @python_2_unicode_compatible decorator (because non-ASCII characters # could in principle occur in the data and would trigger an encoding error when # passed as arguments to str.format()). 
# assert isinstance(outstr, unicode) # not in Python 3.2 return outstr def __str__(self): return self._str() def __repr__(self): return self.__str__() class Future(object): """ Wraps and acts as a proxy for a value to be loaded lazily (on demand). Adapted from https://gist.github.com/sergey-miryanov/2935416 """ def __init__(self, loader, *args, **kwargs): """ :param loader: when called with no arguments, returns the value to be stored :type loader: callable """ super (Future, self).__init__(*args, **kwargs) self._loader = loader self._d = None def _data(self): if callable(self._loader): self._d = self._loader() self._loader = None # the data is now cached return self._d def __nonzero__(self): return bool(self._data()) def __len__(self): return len(self._data()) def __setitem__(self, key, value): return self._data ().__setitem__(key, value) def __getitem__(self, key): return self._data ().__getitem__(key) def __getattr__(self, key): return self._data().__getattr__(key) def __str__(self): return self._data().__str__() def __repr__(self): return self._data().__repr__() @python_2_unicode_compatible class PrettyDict(AttrDict): """ Displays an abbreviated repr of values where possible. Inherits from AttrDict, so a callable value will be lazily converted to an actual value. """ def __init__(self, *args, **kwargs): _BREAK_LINES = kwargs.pop('breakLines', False) super(PrettyDict, self).__init__(*args, **kwargs) dict.__setattr__(self, '_BREAK_LINES', _BREAK_LINES) def __repr__(self): parts = [] for k,v in sorted(self.items()): kv = repr(k)+': ' try: kv += v._short_repr() except AttributeError: kv += repr(v) parts.append(kv) return '{'+(',\n ' if self._BREAK_LINES else ', ').join(parts)+'}' @python_2_unicode_compatible class PrettyList(list): """ Displays an abbreviated repr of only the first several elements, not the whole list. """ # from nltk.util def __init__(self, *args, **kwargs): self._MAX_REPR_SIZE = kwargs.pop('maxReprSize', 60) self._BREAK_LINES = kwargs.pop('breakLines', False) super(PrettyList, self).__init__(*args, **kwargs) def __repr__(self): """ Return a string representation for this corpus view that is similar to a list's representation; but if it would be more than 60 characters long, it is truncated. """ pieces = [] length = 5 for elt in self: pieces.append(elt._short_repr()) # key difference from inherited version: call to _short_repr() length += len(pieces[-1]) + 2 if self._MAX_REPR_SIZE and length > self._MAX_REPR_SIZE and len(pieces) > 2: return "[%s, ...]" % text_type(',\n ' if self._BREAK_LINES else ', ').join(pieces[:-1]) return "[%s]" % text_type(',\n ' if self._BREAK_LINES else ', ').join(pieces) @python_2_unicode_compatible class PrettyLazyMap(LazyMap): """ Displays an abbreviated repr of only the first several elements, not the whole list. """ # from nltk.util _MAX_REPR_SIZE = 60 def __repr__(self): """ Return a string representation for this corpus view that is similar to a list's representation; but if it would be more than 60 characters long, it is truncated. """ pieces = [] length = 5 for elt in self: pieces.append(elt._short_repr()) # key difference from inherited version: call to _short_repr() length += len(pieces[-1]) + 2 if length > self._MAX_REPR_SIZE and len(pieces) > 2: return "[%s, ...]" % text_type(', ').join(pieces[:-1]) else: return "[%s]" % text_type(', ').join(pieces) class FramenetCorpusReader(XMLCorpusReader): """A corpus reader for the Framenet Corpus. 
>>> from nltk.corpus import framenet as fn >>> fn.lu(3238).frame.lexUnit['glint.v'] is fn.lu(3238) True >>> fn.frame_by_name('Replacing') is fn.lus('replace.v')[0].frame True >>> fn.lus('prejudice.n')[0].frame.frameRelations == fn.frame_relations('Partiality') True """ _bad_statuses = ['Problem'] """ When loading LUs for a frame, those whose status is in this list will be ignored. Due to caching, if user code modifies this, it should do so before loading any data. 'Problem' should always be listed for FrameNet 1.5, as these LUs are not included in the XML index. """ def __init__(self, root, fileids): XMLCorpusReader.__init__(self, root, fileids) # framenet corpus sub dirs # sub dir containing the xml files for frames self._frame_dir = "frame" # sub dir containing the xml files for lexical units self._lu_dir = "lu" # sub dir containing the xml files for fulltext annotation files self._fulltext_dir = "fulltext" # Indexes used for faster look-ups self._frame_idx = None self._cached_frames = {} # name -> ID self._lu_idx = None self._fulltext_idx = None self._semtypes = None self._freltyp_idx = None # frame relation types (Inheritance, Using, etc.) self._frel_idx = None # frame-to-frame relation instances self._ferel_idx = None # FE-to-FE relation instances self._frel_f_idx = None # frame-to-frame relations associated with each frame def _buildframeindex(self): # The total number of Frames in Framenet is fairly small (~1200) so # this index should not be very large if not self._frel_idx: self._buildrelationindex() # always load frame relations before frames, # otherwise weird ordering effects might result in incomplete information self._frame_idx = {} for f in XMLCorpusView(self.abspath("frameIndex.xml"), 'frameIndex/frame', self._handle_elt): self._frame_idx[f['ID']] = f def _buildcorpusindex(self): # The total number of fulltext annotated documents in Framenet # is fairly small (~90) so this index should not be very large self._fulltext_idx = {} for doclist in XMLCorpusView(self.abspath("fulltextIndex.xml"), 'fulltextIndex/corpus', self._handle_fulltextindex_elt): for doc in doclist: self._fulltext_idx[doc.ID] = doc def _buildluindex(self): # The number of LUs in Framenet is about 13,000 so this index # should not be very large self._lu_idx = {} for lu in XMLCorpusView(self.abspath("luIndex.xml"), 'luIndex/lu', self._handle_elt): self._lu_idx[lu['ID']] = lu # populate with LU index entries. if any of these # are looked up they will be replaced by full LU objects. 
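    # Illustrative sketch (not part of the original source) of the lazy pattern
    # set up by _buildluindex() above: the index initially holds bare entries
    # from luIndex.xml, and lu() swaps in a full LU object the first time the
    # owning frame gets loaded. With ``fn`` being ``nltk.corpus.framenet`` (as
    # in the doctests above), a session would look roughly like:
    #
    #     >>> fn._buildluindex()
    #     >>> '_type' in fn._lu_idx[256]   # bare index entry only
    #     False
    #     >>> fn.lu(256).name              # forces the owning frame to load
    #     'foresee.v'
    #     >>> fn._lu_idx[256]['_type']     # entry has been replaced in place
    #     'lu'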
def _buildrelationindex(self): #print('building relation index...', file=sys.stderr) freltypes = PrettyList(x for x in XMLCorpusView(self.abspath("frRelation.xml"), 'frameRelations/frameRelationType', self._handle_framerelationtype_elt)) self._freltyp_idx = {} self._frel_idx = {} self._frel_f_idx = defaultdict(set) self._ferel_idx = {} for freltyp in freltypes: self._freltyp_idx[freltyp.ID] = freltyp for frel in freltyp.frameRelations: supF = frel.superFrame = frel[freltyp.superFrameName] = Future((lambda fID: lambda: self.frame_by_id(fID))(frel.supID)) subF = frel.subFrame = frel[freltyp.subFrameName] = Future((lambda fID: lambda: self.frame_by_id(fID))(frel.subID)) self._frel_idx[frel.ID] = frel self._frel_f_idx[frel.supID].add(frel.ID) self._frel_f_idx[frel.subID].add(frel.ID) for ferel in frel.feRelations: ferel.superFrame = supF ferel.subFrame = subF ferel.superFE = Future((lambda fer: lambda: fer.superFrame.FE[fer.superFEName])(ferel)) ferel.subFE = Future((lambda fer: lambda: fer.subFrame.FE[fer.subFEName])(ferel)) self._ferel_idx[ferel.ID] = ferel #print('...done building relation index', file=sys.stderr) def readme(self): """ Return the contents of the corpus README.txt (or README) file. """ try: return self.open("README.txt").read() except IOError: return self.open("README").read() def buildindexes(self): """ Build the internal indexes to make look-ups faster. """ # Frames self._buildframeindex() # LUs self._buildluindex() # Fulltext annotation corpora index self._buildcorpusindex() # frame and FE relations self._buildrelationindex() def annotated_document(self, fn_docid): """ Returns the annotated document whose id number is ``fn_docid``. This id number can be obtained by calling the Documents() function. The dict that is returned from this function will contain the following keys: - '_type' : 'fulltextannotation' - 'sentence' : a list of sentences in the document - Each item in the list is a dict containing the following keys: - 'ID' : the ID number of the sentence - '_type' : 'sentence' - 'text' : the text of the sentence - 'paragNo' : the paragraph number - 'sentNo' : the sentence number - 'docID' : the document ID number - 'corpID' : the corpus ID number - 'aPos' : the annotation position - 'annotationSet' : a list of annotation layers for the sentence - Each item in the list is a dict containing the following keys: - 'ID' : the ID number of the annotation set - '_type' : 'annotationset' - 'status' : either 'MANUAL' or 'UNANN' - 'luName' : (only if status is 'MANUAL') - 'luID' : (only if status is 'MANUAL') - 'frameID' : (only if status is 'MANUAL') - 'frameName': (only if status is 'MANUAL') - 'layer' : a list of labels for the layer - Each item in the layer is a dict containing the following keys: - '_type': 'layer' - 'rank' - 'name' - 'label' : a list of labels in the layer - Each item is a dict containing the following keys: - 'start' - 'end' - 'name' - 'feID' (optional) :param fn_docid: The Framenet id number of the document :type fn_docid: int :return: Information about the annotated document :rtype: dict """ try: xmlfname = self._fulltext_idx[fn_docid].filename except TypeError: # happens when self._fulltext_idx == None # build the index self._buildcorpusindex() xmlfname = self._fulltext_idx[fn_docid].filename except KeyError: # probably means that fn_docid was not in the index raise FramenetError("Unknown document id: {0}".format(fn_docid)) # construct the path name for the xml file containing the document info locpath = os.path.join( "{0}".format(self._root), 
self._fulltext_dir, xmlfname) # Grab the top-level xml element containing the fulltext annotation elt = XMLCorpusView(locpath, 'fullTextAnnotation')[0] return self._handle_fulltextannotation_elt(elt) def frame_by_id(self, fn_fid, ignorekeys=[]): """ Get the details for the specified Frame using the frame's id number. Usage examples: >>> from nltk.corpus import framenet as fn >>> f = fn.frame_by_id(256) >>> f.ID 256 >>> f.name 'Medical_specialties' >>> f.definition "This frame includes words that name ..." :param fn_fid: The Framenet id number of the frame :type fn_fid: int :param ignorekeys: The keys to ignore. These keys will not be included in the output. (optional) :type ignorekeys: list(str) :return: Information about a frame :rtype: dict Also see the ``frame()`` function for details about what is contained in the dict that is returned. """ # get the name of the frame with this id number try: fentry = self._frame_idx[fn_fid] if '_type' in fentry: return fentry # full frame object is cached name = fentry['name'] except TypeError: self._buildframeindex() name = self._frame_idx[fn_fid]['name'] except KeyError: raise FramenetError('Unknown frame id: {0}'.format(fn_fid)) return self.frame_by_name(name, ignorekeys, check_cache=False) def frame_by_name(self, fn_fname, ignorekeys=[], check_cache=True): """ Get the details for the specified Frame using the frame's name. Usage examples: >>> from nltk.corpus import framenet as fn >>> f = fn.frame_by_name('Medical_specialties') >>> f.ID 256 >>> f.name 'Medical_specialties' >>> f.definition "This frame includes words that name ..." :param fn_fname: The name of the frame :type fn_fname: str :param ignorekeys: The keys to ignore. These keys will not be included in the output. (optional) :type ignorekeys: list(str) :return: Information about a frame :rtype: dict Also see the ``frame()`` function for details about what is contained in the dict that is returned. """ if check_cache and fn_fname in self._cached_frames: return self._frame_idx[self._cached_frames[fn_fname]] elif not self._frame_idx: self._buildframeindex() # construct the path name for the xml file containing the Frame info locpath = os.path.join( "{0}".format(self._root), self._frame_dir, fn_fname + ".xml") #print(locpath, file=sys.stderr) # Grab the xml for the frame try: elt = XMLCorpusView(locpath, 'frame')[0] except IOError: raise FramenetError('Unknown frame: {0}'.format(fn_fname)) fentry = self._handle_frame_elt(elt, ignorekeys) assert fentry # INFERENCE RULE: propagate lexical semtypes from the frame to all its LUs for st in fentry.semTypes: if st.rootType.name=='Lexical_type': for lu in fentry.lexUnit.values(): if st not in lu.semTypes: lu.semTypes.append(st) self._frame_idx[fentry.ID] = fentry self._cached_frames[fentry.name] = fentry.ID ''' # now set up callables to resolve the LU pointers lazily. # (could also do this here--caching avoids infinite recursion.) for luName,luinfo in fentry.lexUnit.items(): fentry.lexUnit[luName] = (lambda luID: Future(lambda: self.lu(luID)))(luinfo.ID) ''' return fentry def frame(self, fn_fid_or_fname, ignorekeys=[]): """ Get the details for the specified Frame using the frame's name or id number. Usage examples: >>> from nltk.corpus import framenet as fn >>> f = fn.frame(256) >>> f.name 'Medical_specialties' >>> f = fn.frame('Medical_specialties') >>> f.ID 256 >>> # ensure non-ASCII character in definition doesn't trigger an encoding error: >>> fn.frame('Imposing_obligation') frame (1494): Imposing_obligation... 
The dict that is returned from this function will contain the following information about the Frame: - 'name' : the name of the Frame (e.g. 'Birth', 'Apply_heat', etc.) - 'definition' : textual definition of the Frame - 'ID' : the internal ID number of the Frame - 'semTypes' : a list of semantic types for this frame - Each item in the list is a dict containing the following keys: - 'name' : can be used with the semtype() function - 'ID' : can be used with the semtype() function - 'lexUnit' : a dict containing all of the LUs for this frame. The keys in this dict are the names of the LUs and the value for each key is itself a dict containing info about the LU (see the lu() function for more info.) - 'FE' : a dict containing the Frame Elements that are part of this frame The keys in this dict are the names of the FEs (e.g. 'Body_system') and the values are dicts containing the following keys - 'definition' : The definition of the FE - 'name' : The name of the FE e.g. 'Body_system' - 'ID' : The id number - '_type' : 'fe' - 'abbrev' : Abbreviation e.g. 'bod' - 'coreType' : one of "Core", "Peripheral", or "Extra-Thematic" - 'semType' : if not None, a dict with the following two keys: - 'name' : name of the semantic type. can be used with the semtype() function - 'ID' : id number of the semantic type. can be used with the semtype() function - 'requiresFE' : if not None, a dict with the following two keys: - 'name' : the name of another FE in this frame - 'ID' : the id of the other FE in this frame - 'excludesFE' : if not None, a dict with the following two keys: - 'name' : the name of another FE in this frame - 'ID' : the id of the other FE in this frame - 'frameRelation' : a list of objects describing frame relations - 'FEcoreSets' : a list of Frame Element core sets for this frame - Each item in the list is a list of FE objects :param fn_fid_or_fname: The Framenet name or id number of the frame :type fn_fid_or_fname: int or str :param ignorekeys: The keys to ignore. These keys will not be included in the output. (optional) :type ignorekeys: list(str) :return: Information about a frame :rtype: dict """ # get the frame info by name or id number if isinstance(fn_fid_or_fname, string_types): f = self.frame_by_name(fn_fid_or_fname, ignorekeys) else: f = self.frame_by_id(fn_fid_or_fname, ignorekeys) return f def frames_by_lemma(self, pat): """ Returns a list of all frames that contain LUs in which the ``name`` attribute of the LU matchs the given regular expression ``pat``. Note that LU names are composed of "lemma.POS", where the "lemma" part can be made up of either a single lexeme (e.g. 'run') or multiple lexemes (e.g. 'a little'). Note: if you are going to be doing a lot of this type of searching, you'd want to build an index that maps from lemmas to frames because each time frames_by_lemma() is called, it has to search through ALL of the frame XML files in the db. >>> from nltk.corpus import framenet as fn >>> fn.frames_by_lemma(r'(?i)a little') [, ] :return: A list of frame objects. :rtype: list(AttrDict) """ return PrettyList(f for f in self.frames() if any(re.search(pat, luName) for luName in f.lexUnit)) def lu_basic(self, fn_luid): """ Returns basic information about the LU whose id is ``fn_luid``. This is basically just a wrapper around the ``lu()`` function with "subCorpus" info excluded. 
>>> from nltk.corpus import framenet as fn >>> PrettyDict(fn.lu_basic(256), breakLines=True) {'ID': 256, 'POS': 'V', '_type': 'lu', 'definition': 'COD: be aware of beforehand; predict.', 'frame': , 'lemmaID': 15082, 'lexemes': [{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}], 'name': 'foresee.v', 'semTypes': [], 'sentenceCount': {'annotated': 44, 'total': 227}, 'status': 'FN1_Sent'} :param fn_luid: The id number of the desired LU :type fn_luid: int :return: Basic information about the lexical unit :rtype: dict """ return self.lu(fn_luid, ignorekeys=['subCorpus']) def lu(self, fn_luid, ignorekeys=[]): """ Get information about a specific Lexical Unit using the id number ``fn_luid``. This function reads the LU information from the xml file on disk each time it is called. You may want to cache this info if you plan to call this function with the same id number multiple times. Usage examples: >>> from nltk.corpus import framenet as fn >>> fn.lu(256).name 'foresee.v' >>> fn.lu(256).definition 'COD: be aware of beforehand; predict.' >>> fn.lu(256).frame.name 'Expectation' >>> pprint(list(map(PrettyDict, fn.lu(256).lexemes))) [{'POS': 'V', 'breakBefore': 'false', 'headword': 'false', 'name': 'foresee', 'order': 1}] The dict that is returned from this function will contain most of the following information about the LU. Note that some LUs do not contain all of these pieces of information - particularly 'totalAnnotated' and 'incorporatedFE' may be missing in some LUs: - 'name' : the name of the LU (e.g. 'merger.n') - 'definition' : textual definition of the LU - 'ID' : the internal ID number of the LU - '_type' : 'lu' - 'status' : e.g. 'Created' - 'frame' : Frame that this LU belongs to - 'POS' : the part of speech of this LU (e.g. 'N') - 'totalAnnotated' : total number of examples annotated with this LU - 'incorporatedFE' : FE that incorporates this LU (e.g. 'Ailment') - 'sentenceCount' : a dict with the following two keys: - 'annotated': number of sentences annotated with this LU - 'total' : total number of sentences with this LU - 'lexemes' : a list of dicts describing the lemma of this LU. Each dict in the list contains these keys: - 'POS' : part of speech e.g. 'N' - 'name' : either single-lexeme e.g. 'merger' or multi-lexeme e.g. 'a little' - 'order': the order of the lexeme in the lemma (starting from 1) - 'headword': a boolean ('true' or 'false') - 'breakBefore': Can this lexeme be separated from the previous lexeme? Consider: "take over.v" as in: Germany took over the Netherlands in 2 days. Germany took the Netherlands over in 2 days. In this case, 'breakBefore' would be "true" for the lexeme "over". Contrast this with "take after.v" as in: Mary takes after her grandmother. *Mary takes her grandmother after. In this case, 'breakBefore' would be "false" for the lexeme "after" - 'lemmaID' : Can be used to connect lemmas in different LUs - 'semTypes' : a list of semantic type objects for this LU - 'subCorpus' : a list of subcorpora - Each item in the list is a dict containing the following keys: - 'name' : - 'sentence' : a list of sentences in the subcorpus - each item in the list is a dict with the following keys: - 'ID': - 'sentNo': - 'text': the text of the sentence - 'aPos': - 'annotationSet': a list of annotation sets - each item in the list is a dict with the following keys: - 'ID': - 'status': - 'layer': a list of layers - each layer is a dict containing the following keys: - 'name': layer name (e.g. 
'BNC') - 'rank': - 'label': a list of labels for the layer - each label is a dict containing the following keys: - 'start': start pos of label in sentence 'text' (0-based) - 'end': end pos of label in sentence 'text' (0-based) - 'name': name of label (e.g. 'NN1') Under the hood, this implementation looks up the lexical unit information in the *frame* definition file. That file does not contain corpus annotations, so the LU files will be accessed on demand if those are needed. In principle, valence patterns could be loaded here too, though these are not currently supported. :param fn_luid: The id number of the lexical unit :type fn_luid: int :param ignorekeys: The keys to ignore. These keys will not be included in the output. (optional) :type ignorekeys: list(str) :return: All information about the lexical unit :rtype: dict """ # look for this LU in cache if not self._lu_idx: self._buildluindex() luinfo = self._lu_idx[fn_luid] if '_type' not in luinfo: # we only have an index entry for the LU. loading the frame will replace this. f = self.frame_by_id(luinfo.frameID) luinfo = self._lu_idx[fn_luid] if ignorekeys: return AttrDict(dict((k, v) for k, v in luinfo.items() if k not in ignorekeys)) return luinfo def _lu_file(self, lu, ignorekeys=[]): """ Augment the LU information that was loaded from the frame file with additional information from the LU file. """ fn_luid = lu.ID fname = "lu{0}.xml".format(fn_luid) locpath = os.path.join("{0}".format(self._root), self._lu_dir, fname) #print(locpath, file=sys.stderr) if not self._lu_idx: self._buildluindex() try: elt = XMLCorpusView(locpath, 'lexUnit')[0] except IOError: raise FramenetError('Unknown LU id: {0}'.format(fn_luid)) lu2 = self._handle_lexunit_elt(elt, ignorekeys) lu.subCorpus = lu2.subCorpus return lu.subCorpus def _loadsemtypes(self): """Create the semantic types index.""" self._semtypes = AttrDict() semtypeXML = [x for x in XMLCorpusView(self.abspath("semTypes.xml"), 'semTypes/semType', self._handle_semtype_elt)] for st in semtypeXML: n = st['name'] a = st['abbrev'] i = st['ID'] # Both name and abbrev should be able to retrieve the # ID. The ID will retrieve the semantic type dict itself. self._semtypes[n] = i self._semtypes[a] = i self._semtypes[i] = st # now that all individual semtype XML is loaded, we can link them together roots = [] for st in self.semtypes(): if st.superType: st.superType = self.semtype(st.superType.supID) st.superType.subTypes.append(st) else: if st not in roots: roots.append(st) st.rootType = st queue = list(roots) assert queue while queue: st = queue.pop(0) for child in st.subTypes: child.rootType = st.rootType queue.append(child) #self.propagate_semtypes() # apply inferencing over FE relations def propagate_semtypes(self): """ Apply inference rules to distribute semtypes over relations between FEs. For FrameNet 1.5, this results in 1011 semtypes being propagated. (Not done by default because it requires loading all frame files, which takes several seconds. If this needed to be fast, it could be rewritten to traverse the neighboring relations on demand for each FE semtype.) 
>>> from nltk.corpus import framenet as fn >>> sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType) 4241 >>> fn.propagate_semtypes() >>> sum(1 for f in fn.frames() for fe in f.FE.values() if fe.semType) 5252 """ if not self._semtypes: self._loadsemtypes() if not self._ferel_idx: self._buildrelationindex() changed = True i = 0 nPropagations = 0 while changed: # make a pass and see if anything needs to be propagated i += 1 changed = False for ferel in self.fe_relations(): superST = ferel.superFE.semType subST = ferel.subFE.semType try: if superST and superST is not subST: # propagate downward assert subST is None or self.semtype_inherits(subST, superST),(superST.name,ferel,subST.name) if subST is None: ferel.subFE.semType = subST = superST changed = True nPropagations += 1 if ferel.type.name in ['Perspective_on', 'Subframe', 'Precedes'] and subST \ and subST is not superST: # propagate upward assert superST is None,(superST.name,ferel,subST.name) ferel.superFE.semType = superST = subST changed = True nPropagations += 1 except AssertionError as ex: # bug in the data! ignore #print(ex, file=sys.stderr) continue #print(i, nPropagations, file=sys.stderr) def semtype(self, key): """ >>> from nltk.corpus import framenet as fn >>> fn.semtype(233).name 'Temperature' >>> fn.semtype(233).abbrev 'Temp' >>> fn.semtype('Temperature').ID 233 :param key: The name, abbreviation, or id number of the semantic type :type key: string or int :return: Information about a semantic type :rtype: dict """ if isinstance(key, int): stid = key else: try: stid = self._semtypes[key] except TypeError: self._loadsemtypes() stid = self._semtypes[key] try: st = self._semtypes[stid] except TypeError: self._loadsemtypes() st = self._semtypes[stid] return st def semtype_inherits(self, st, superST): if not isinstance(st, dict): st = self.semtype(st) if not isinstance(superST, dict): superST = self.semtype(superST) par = st.superType while par: if par is superST: return True par = par.superType return False def frames(self, name=None): """ Obtain details for a specific frame. >>> from nltk.corpus import framenet as fn >>> len(fn.frames()) 1019 >>> PrettyList(fn.frames(r'(?i)medical'), maxReprSize=0, breakLines=True) [, , , ] A brief intro to Frames (excerpted from "FrameNet II: Extended Theory and Practice" by Ruppenhofer et. al., 2010): A Frame is a script-like conceptual structure that describes a particular type of situation, object, or event along with the participants and props that are needed for that Frame. For example, the "Apply_heat" frame describes a common situation involving a Cook, some Food, and a Heating_Instrument, and is evoked by words such as bake, blanch, boil, broil, brown, simmer, steam, etc. We call the roles of a Frame "frame elements" (FEs) and the frame-evoking words are called "lexical units" (LUs). FrameNet includes relations between Frames. Several types of relations are defined, of which the most important are: - Inheritance: An IS-A relation. The child frame is a subtype of the parent frame, and each FE in the parent is bound to a corresponding FE in the child. An example is the "Revenge" frame which inherits from the "Rewards_and_punishments" frame. - Using: The child frame presupposes the parent frame as background, e.g the "Speed" frame "uses" (or presupposes) the "Motion" frame; however, not all parent FEs need to be bound to child FEs. - Subframe: The child frame is a subevent of a complex event represented by the parent, e.g. 
the "Criminal_process" frame has subframes of "Arrest", "Arraignment", "Trial", and "Sentencing". - Perspective_on: The child frame provides a particular perspective on an un-perspectivized parent frame. A pair of examples consists of the "Hiring" and "Get_a_job" frames, which perspectivize the "Employment_start" frame from the Employer's and the Employee's point of view, respectively. :param name: A regular expression pattern used to match against Frame names. If 'name' is None, then a list of all Framenet Frames will be returned. :type name: str :return: A list of matching Frames (or all Frames). :rtype: list(AttrDict) """ try: fIDs = list(self._frame_idx.keys()) except AttributeError: self._buildframeindex() fIDs = list(self._frame_idx.keys()) if name is not None: return PrettyList(self.frame(fID) for fID,finfo in self.frame_ids_and_names(name).items()) else: return PrettyLazyMap(self.frame, fIDs) def frame_ids_and_names(self, name=None): """ Uses the frame index, which is much faster than looking up each frame definition if only the names and IDs are needed. """ if not self._frame_idx: self._buildframeindex() return dict((fID, finfo.name) for fID,finfo in self._frame_idx.items() if name is None or re.search(name, finfo.name) is not None) def fes(self, name=None): ''' Lists frame element objects. If 'name' is provided, this is treated as a case-insensitive regular expression to filter by frame name. (Case-insensitivity is because casing of frame element names is not always consistent across frames.) >>> from nltk.corpus import framenet as fn >>> fn.fes('Noise_maker') [] >>> sorted([(fe.frame.name,fe.name) for fe in fn.fes('sound')]) [('Cause_to_make_noise', 'Sound_maker'), ('Make_noise', 'Sound'), ('Make_noise', 'Sound_source'), ('Sound_movement', 'Location_of_sound_source'), ('Sound_movement', 'Sound'), ('Sound_movement', 'Sound_source'), ('Sounds', 'Component_sound'), ('Sounds', 'Location_of_sound_source'), ('Sounds', 'Sound_source'), ('Vocalizations', 'Location_of_sound_source'), ('Vocalizations', 'Sound_source')] >>> sorted(set(fe.name for fe in fn.fes('^sound'))) ['Sound', 'Sound_maker', 'Sound_source'] >>> len(fn.fes('^sound$')) 2 :param name: A regular expression pattern used to match against frame element names. If 'name' is None, then a list of all frame elements will be returned. :type name: str :return: A list of matching frame elements :rtype: list(AttrDict) ''' return PrettyList(fe for f in self.frames() for fename,fe in f.FE.items() if name is None or re.search(name, fename, re.I)) def lus(self, name=None): """ Obtain details for a specific lexical unit. >>> from nltk.corpus import framenet as fn >>> len(fn.lus()) 11829 >>> PrettyList(fn.lus(r'(?i)a little'), maxReprSize=0, breakLines=True) [, , ] A brief intro to Lexical Units (excerpted from "FrameNet II: Extended Theory and Practice" by Ruppenhofer et. al., 2010): A lexical unit (LU) is a pairing of a word with a meaning. For example, the "Apply_heat" Frame describes a common situation involving a Cook, some Food, and a Heating Instrument, and is _evoked_ by words such as bake, blanch, boil, broil, brown, simmer, steam, etc. These frame-evoking words are the LUs in the Apply_heat frame. Each sense of a polysemous word is a different LU. We have used the word "word" in talking about LUs. The reality is actually rather complex. 
When we say that the word "bake" is polysemous, we mean that the lemma "bake.v" (which has the word-forms "bake", "bakes", "baked", and "baking") is linked to three different frames: - Apply_heat: "Michelle baked the potatoes for 45 minutes." - Cooking_creation: "Michelle baked her mother a cake for her birthday." - Absorb_heat: "The potatoes have to bake for more than 30 minutes." These constitute three different LUs, with different definitions. Multiword expressions such as "given name" and hyphenated words like "shut-eye" can also be LUs. Idiomatic phrases such as "middle of nowhere" and "give the slip (to)" are also defined as LUs in the appropriate frames ("Isolated_places" and "Evading", respectively), and their internal structure is not analyzed. Framenet provides multiple annotated examples of each sense of a word (i.e. each LU). Moreover, the set of examples (approximately 20 per LU) illustrates all of the combinatorial possibilities of the lexical unit. Each LU is linked to a Frame, and hence to the other words which evoke that Frame. This makes the FrameNet database similar to a thesaurus, grouping together semantically similar words. In the simplest case, frame-evoking words are verbs such as "fried" in: "Matilde fried the catfish in a heavy iron skillet." Sometimes event nouns may evoke a Frame. For example, "reduction" evokes "Cause_change_of_scalar_position" in: "...the reduction of debt levels to $665 million from $2.6 billion." Adjectives may also evoke a Frame. For example, "asleep" may evoke the "Sleep" frame as in: "They were asleep for hours." Many common nouns, such as artifacts like "hat" or "tower", typically serve as dependents rather than clearly evoking their own frames. :param name: A regular expression pattern used to search the LU names. Note that LU names take the form of a dotted string (e.g. "run.v" or "a little.adv") in which a lemma preceeds the "." and a POS follows the dot. The lemma may be composed of a single lexeme (e.g. "run") or of multiple lexemes (e.g. "a little"). If 'name' is not given, then all LUs will be returned. The valid POSes are: v - verb n - noun a - adjective adv - adverb prep - preposition num - numbers intj - interjection art - article c - conjunction scon - subordinating conjunction :type name: str :return: A list of selected (or all) lexical units :rtype: list of LU objects (dicts). See the lu() function for info about the specifics of LU objects. """ try: luIDs = list(self._lu_idx.keys()) except AttributeError: self._buildluindex() luIDs = list(self._lu_idx.keys()) if name is not None: return PrettyList(self.lu(luID) for luID,luName in self.lu_ids_and_names(name).items()) else: return PrettyLazyMap(self.lu, luIDs) def lu_ids_and_names(self, name=None): """ Uses the LU index, which is much faster than looking up each LU definition if only the names and IDs are needed. """ if not self._lu_idx: self._buildluindex() return dict((luID, luinfo.name) for luID,luinfo in self._lu_idx.items() if name is None or re.search(name, luinfo.name) is not None) def documents(self, name=None): """ Return a list of the annotated documents in Framenet. Details for a specific annotated document can be obtained using this class's annotated_document() function and pass it the value of the 'ID' field. 
>>> from nltk.corpus import framenet as fn >>> len(fn.documents()) 78 >>> set([x.corpname for x in fn.documents()])==set(['ANC', 'C-4', 'KBEval', \ 'LUCorpus-v0.3', 'Miscellaneous', 'NTI', 'PropBank', 'QA', 'SemAnno']) True :param name: A regular expression pattern used to search the file name of each annotated document. The document's file name contains the name of the corpus that the document is from, followed by two underscores "__" followed by the document name. So, for example, the file name "LUCorpus-v0.3__20000410_nyt-NEW.xml" is from the corpus named "LUCorpus-v0.3" and the document name is "20000410_nyt-NEW.xml". :type name: str :return: A list of selected (or all) annotated documents :rtype: list of dicts, where each dict object contains the following keys: - 'name' - 'ID' - 'corpid' - 'corpname' - 'description' - 'filename' """ try: ftlist = PrettyList(self._fulltext_idx.values()) except AttributeError: self._buildcorpusindex() ftlist = PrettyList(self._fulltext_idx.values()) if name is None: return ftlist else: return PrettyList(x for x in ftlist if re.search(name, x['filename']) is not None) def frame_relation_types(self): """ Obtain a list of frame relation types. >>> from nltk.corpus import framenet as fn >>> frts = list(fn.frame_relation_types()) >>> isinstance(frts, list) True >>> len(frts) 9 >>> PrettyDict(frts[0], breakLines=True) {'ID': 1, '_type': 'framerelationtype', 'frameRelations': [ Child=Change_of_consistency>, Child=Rotting>, ...], 'name': 'Inheritance', 'subFrameName': 'Child', 'superFrameName': 'Parent'} :return: A list of all of the frame relation types in framenet :rtype: list(dict) """ if not self._freltyp_idx: self._buildrelationindex() return self._freltyp_idx.values() def frame_relations(self, frame=None, frame2=None, type=None): """ :param frame: (optional) frame object, name, or ID; only relations involving this frame will be returned :param frame2: (optional; 'frame' must be a different frame) only show relations between the two specified frames, in either direction :param type: (optional) frame relation type (name or object); show only relations of this type :type frame: int or str or AttrDict :return: A list of all of the frame relations in framenet :rtype: list(dict) >>> from nltk.corpus import framenet as fn >>> frels = fn.frame_relations() >>> isinstance(frels, list) True >>> len(frels) 1676 >>> PrettyList(fn.frame_relations('Cooking_creation'), maxReprSize=0, breakLines=True) [ Child=Cooking_creation>, Child=Cooking_creation>, ReferringEntry=Cooking_creation>] >>> PrettyList(fn.frame_relations(373), breakLines=True) [ Child=Communication>, Target=Topic>, ...] >>> PrettyList(fn.frame_relations(fn.frame('Cooking_creation')), breakLines=True) [ Child=Cooking_creation>, Child=Cooking_creation>, ...] 
>>> PrettyList(fn.frame_relations('Cooking_creation', type='Inheritance')) [ Child=Cooking_creation>] >>> PrettyList(fn.frame_relations('Cooking_creation', 'Apply_heat'), breakLines=True) [ Child=Cooking_creation>, ReferringEntry=Cooking_creation>] """ relation_type = type if not self._frel_idx: self._buildrelationindex() rels = None if relation_type is not None: if not isinstance(relation_type, dict): type = [rt for rt in self.frame_relation_types() if rt.name==type][0] assert isinstance(type,dict) # lookup by 'frame' if frame is not None: if isinstance(frame,dict) and 'frameRelations' in frame: rels = PrettyList(frame.frameRelations) else: if not isinstance(frame, int): if isinstance(frame, dict): frame = frame.ID else: frame = self.frame_by_name(frame).ID rels = [self._frel_idx[frelID] for frelID in self._frel_f_idx[frame]] # filter by 'type' if type is not None: rels = [rel for rel in rels if rel.type is type] elif type is not None: # lookup by 'type' rels = type.frameRelations else: rels = self._frel_idx.values() # filter by 'frame2' if frame2 is not None: if frame is None: raise FramenetError("frame_relations(frame=None, frame2=) is not allowed") if not isinstance(frame2, int): if isinstance(frame2, dict): frame2 = frame2.ID else: frame2 = self.frame_by_name(frame2).ID if frame==frame2: raise FramenetError("The two frame arguments to frame_relations() must be different frames") rels = [rel for rel in rels if rel.superFrame.ID==frame2 or rel.subFrame.ID==frame2] return PrettyList(sorted(rels, key=lambda frel: (frel.type.ID, frel.superFrameName, frel.subFrameName))) def fe_relations(self): """ Obtain a list of frame element relations. >>> from nltk.corpus import framenet as fn >>> ferels = fn.fe_relations() >>> isinstance(ferels, list) True >>> len(ferels) 10020 >>> PrettyDict(ferels[0], breakLines=True) {'ID': 14642, '_type': 'ferelation', 'frameRelation': Child=Lively_place>, 'subFE': , 'subFEName': 'Degree', 'subFrame': , 'subID': 11370, 'supID': 2271, 'superFE': , 'superFEName': 'Degree', 'superFrame': , 'type': } :return: A list of all of the frame element relations in framenet :rtype: list(dict) """ if not self._ferel_idx: self._buildrelationindex() return PrettyList(sorted(self._ferel_idx.values(), key=lambda ferel: (ferel.type.ID, ferel.frameRelation.superFrameName, ferel.superFEName, ferel.frameRelation.subFrameName, ferel.subFEName))) def semtypes(self): """ Obtain a list of semantic types. >>> from nltk.corpus import framenet as fn >>> stypes = fn.semtypes() >>> len(stypes) 73 >>> sorted(stypes[0].keys()) ['ID', '_type', 'abbrev', 'definition', 'name', 'rootType', 'subTypes', 'superType'] :return: A list of all of the semantic types in framenet :rtype: list(dict) """ if not self._semtypes: self._loadsemtypes() return PrettyList(self._semtypes[i] for i in self._semtypes if isinstance(i, int)) def _load_xml_attributes(self, d, elt): """ Extracts a subset of the attributes from the given element and returns them in a dictionary. :param d: A dictionary in which to store the attributes. 
:type d: dict :param elt: An ElementTree Element :type elt: Element :return: Returns the input dict ``d`` possibly including attributes from ``elt`` :rtype: dict """ d = type(d)(d) try: attr_dict = elt.attrib except AttributeError: return d if attr_dict is None: return d # Ignore these attributes when loading attributes from an xml node ignore_attrs = ['cBy', 'cDate', 'mDate', 'xsi', 'schemaLocation', 'xmlns', 'bgColor', 'fgColor'] for attr in attr_dict: if any(attr.endswith(x) for x in ignore_attrs): continue val = attr_dict[attr] if val.isdigit(): d[attr] = int(val) else: d[attr] = val return d def _strip_tags(self, data): """ Gets rid of all tags and newline characters from the given input :return: A cleaned-up version of the input string :rtype: str """ try: data = data.replace('', '') data = data.replace('', '') data = re.sub('', '', data) data = data.replace('', '') data = data.replace('', '') data = data.replace('', '') data = data.replace('', '') data = data.replace('', '') data = data.replace('', '') data = data.replace('', '') data = data.replace('', "'") data = data.replace('', "'") data = data.replace('', '') data = data.replace('', '') data = data.replace('', '') data = data.replace('', '') # Get rid of and tags data = data.replace('', '') data = data.replace('', '') data = data.replace('\n', ' ') except AttributeError: pass return data def _handle_elt(self, elt, tagspec=None): """Extracts and returns the attributes of the given element""" return self._load_xml_attributes(AttrDict(), elt) def _handle_fulltextindex_elt(self, elt, tagspec=None): """ Extracts corpus/document info from the fulltextIndex.xml file. Note that this function "flattens" the information contained in each of the "corpus" elements, so that each "document" element will contain attributes for the corpus and corpusid. Also, each of the "document" items will contain a new attribute called "filename" that is the base file name of the xml file for the document in the "fulltext" subdir of the Framenet corpus. 
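        For example, a document named "20000410_nyt-NEW" belonging to the
        "LUCorpus-v0.3" corpus receives the filename
        "LUCorpus-v0.3__20000410_nyt-NEW.xml", i.e. "{corpus}__{document}.xml".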
""" ftinfo = self._load_xml_attributes(AttrDict(), elt) corpname = ftinfo.name corpid = ftinfo.ID retlist = [] for sub in elt: if sub.tag.endswith('document'): doc = self._load_xml_attributes(AttrDict(), sub) if 'name' in doc: docname = doc.name else: docname = doc.description doc.filename = "{0}__{1}.xml".format(corpname, docname) doc.corpname = corpname doc.corpid = corpid retlist.append(doc) return retlist def _handle_frame_elt(self, elt, ignorekeys=[]): """Load the info for a Frame from an frame xml file""" frinfo = self._load_xml_attributes(AttrDict(), elt) frinfo['_type'] = 'frame' frinfo['definition'] = "" frinfo['FE'] = PrettyDict() frinfo['FEcoreSets'] = [] frinfo['lexUnit'] = PrettyDict() frinfo['semTypes'] = [] for k in ignorekeys: if k in frinfo: del frinfo[k] for sub in elt: if sub.tag.endswith('definition') and 'definition' not in ignorekeys: frinfo['definition'] = self._strip_tags(sub.text) elif sub.tag.endswith('FE') and 'FE' not in ignorekeys: feinfo = self._handle_fe_elt(sub) frinfo['FE'][feinfo.name] = feinfo feinfo['frame'] = frinfo # backpointer elif sub.tag.endswith('FEcoreSet') and 'FEcoreSet' not in ignorekeys: coreset = self._handle_fecoreset_elt(sub) # assumes all FEs have been loaded before coresets frinfo['FEcoreSets'].append(PrettyList(frinfo['FE'][fe.name] for fe in coreset)) elif sub.tag.endswith('lexUnit') and 'lexUnit' not in ignorekeys: luentry = self._handle_framelexunit_elt(sub) if luentry['status'] in self._bad_statuses: # problematic LU entry; ignore it continue luentry['frame'] = frinfo luentry['subCorpus'] = Future((lambda lu: lambda: self._lu_file(lu))(luentry)) frinfo['lexUnit'][luentry.name] = luentry if not self._lu_idx: self._buildluindex() self._lu_idx[luentry.ID] = luentry elif sub.tag.endswith('semType') and 'semTypes' not in ignorekeys: semtypeinfo = self._load_xml_attributes(AttrDict(), sub) frinfo['semTypes'].append(self.semtype(semtypeinfo.ID)) frinfo['frameRelations'] = self.frame_relations(frame=frinfo) # resolve 'requires' and 'excludes' links between FEs of this frame for fe in frinfo.FE.values(): if fe.requiresFE: name, ID = fe.requiresFE.name, fe.requiresFE.ID fe.requiresFE = frinfo.FE[name] assert fe.requiresFE.ID==ID if fe.excludesFE: name, ID = fe.excludesFE.name, fe.excludesFE.ID fe.excludesFE = frinfo.FE[name] assert fe.excludesFE.ID==ID return frinfo def _handle_fecoreset_elt(self, elt): """Load fe coreset info from xml.""" info = self._load_xml_attributes(AttrDict(), elt) tmp = [] for sub in elt: tmp.append(self._load_xml_attributes(AttrDict(), sub)) return tmp def _handle_framerelationtype_elt(self, elt, *args): """Load frame-relation element and its child fe-relation elements from frRelation.xml.""" info = self._load_xml_attributes(AttrDict(), elt) info['_type'] = 'framerelationtype' info['frameRelations'] = PrettyList() for sub in elt: if sub.tag.endswith('frameRelation'): frel = self._handle_framerelation_elt(sub) frel['type'] = info # backpointer for ferel in frel.feRelations: ferel['type'] = info info['frameRelations'].append(frel) return info def _handle_framerelation_elt(self, elt): """Load frame-relation element and its child fe-relation elements from frRelation.xml.""" info = self._load_xml_attributes(AttrDict(), elt) assert info['superFrameName']!=info['subFrameName'],(elt,info) info['_type'] = 'framerelation' info['feRelations'] = PrettyList() for sub in elt: if sub.tag.endswith('FERelation'): ferel = self._handle_elt(sub) ferel['_type'] = 'ferelation' ferel['frameRelation'] = info # backpointer 
info['feRelations'].append(ferel) return info def _handle_fulltextannotation_elt(self, elt): """Load full annotation info for a document from its xml file. The main element (fullTextAnnotation) contains a 'header' element (which we ignore here) and a bunch of 'sentence' elements.""" info = AttrDict() info['_type'] = 'fulltextannotation' info['sentence'] = [] for sub in elt: if sub.tag.endswith('header'): continue # not used elif sub.tag.endswith('sentence'): s = self._handle_fulltext_sentence_elt(sub) info['sentence'].append(s) return info def _handle_fulltext_sentence_elt(self, elt): """Load information from the given 'sentence' element. Each 'sentence' element contains a "text" and an "annotationSet" sub element.""" info = self._load_xml_attributes(AttrDict(), elt) info['_type'] = "sentence" info['annotationSet'] = [] info['text'] = "" for sub in elt: if sub.tag.endswith('text'): info['text'] = self._strip_tags(sub.text) elif sub.tag.endswith('annotationSet'): a = self._handle_fulltextannotationset_elt(sub) info['annotationSet'].append(a) return info def _handle_fulltextannotationset_elt(self, elt): """Load information from the given 'annotationSet' element. Each 'annotationSet' contains several "layer" elements.""" info = self._load_xml_attributes(AttrDict(), elt) info['_type'] = "annotationset" info['layer'] = [] for sub in elt: if sub.tag.endswith('layer'): l = self._handle_fulltextlayer_elt(sub) info['layer'].append(l) return info def _handle_fulltextlayer_elt(self, elt): """Load information from the given 'layer' element. Each 'layer' contains several "label" elements.""" info = self._load_xml_attributes(AttrDict(), elt) info['_type'] = 'layer' info['label'] = [] for sub in elt: if sub.tag.endswith('label'): l = self._load_xml_attributes(AttrDict(), sub) info['label'].append(l) return info def _handle_framelexunit_elt(self, elt): """Load the lexical unit info from an xml element in a frame's xml file.""" luinfo = AttrDict() luinfo['_type'] = 'lu' luinfo = self._load_xml_attributes(luinfo, elt) luinfo["definition"] = "" luinfo["sentenceCount"] = PrettyDict() luinfo['lexemes'] = PrettyList() # multiword LUs have multiple lexemes luinfo['semTypes'] = PrettyList() # an LU can have multiple semtypes for sub in elt: if sub.tag.endswith('definition'): luinfo['definition'] = self._strip_tags(sub.text) elif sub.tag.endswith('sentenceCount'): luinfo['sentenceCount'] = self._load_xml_attributes( PrettyDict(), sub) elif sub.tag.endswith('lexeme'): luinfo['lexemes'].append(self._load_xml_attributes(PrettyDict(), sub)) elif sub.tag.endswith('semType'): semtypeinfo = self._load_xml_attributes(PrettyDict(), sub) luinfo['semTypes'].append(self.semtype(semtypeinfo.ID)) return luinfo def _handle_lexunit_elt(self, elt, ignorekeys): """ Load full info for a lexical unit from its xml file. This should only be called when accessing corpus annotations (which are not included in frame files). 
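        The per-LU files live in the "lu" subdirectory of the corpus root and
        are named by LU id (e.g. "lu256.xml"); ``_lu_file()`` reads such a file
        and passes its top-level ``lexUnit`` element to this handler.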
""" luinfo = self._load_xml_attributes(AttrDict(), elt) luinfo['_type'] = 'lu' luinfo['definition'] = "" luinfo['subCorpus'] = PrettyList() luinfo['lexemes'] = PrettyList() # multiword LUs have multiple lexemes luinfo['semTypes'] = PrettyList() # an LU can have multiple semtypes for k in ignorekeys: if k in luinfo: del luinfo[k] for sub in elt: if sub.tag.endswith('header'): continue # not used elif sub.tag.endswith('valences'): continue # not used elif sub.tag.endswith('definition') and 'definition' not in ignorekeys: luinfo['definition'] = self._strip_tags(sub.text) elif sub.tag.endswith('subCorpus') and 'subCorpus' not in ignorekeys: sc = self._handle_lusubcorpus_elt(sub) if sc is not None: luinfo['subCorpus'].append(sc) elif sub.tag.endswith('lexeme') and 'lexeme' not in ignorekeys: luinfo['lexemes'].append(self._load_xml_attributes(PrettyDict(), sub)) elif sub.tag.endswith('semType') and 'semType' not in ignorekeys: semtypeinfo = self._load_xml_attributes(AttrDict(), sub) luinfo['semTypes'].append(self.semtype(semtypeinfo.ID)) return luinfo def _handle_lusubcorpus_elt(self, elt): """Load a subcorpus of a lexical unit from the given xml.""" sc = AttrDict() try: sc['name'] = str(elt.get('name')) except AttributeError: return None sc['_type'] = "lusubcorpus" sc['sentence'] = [] for sub in elt: if sub.tag.endswith('sentence'): s = self._handle_lusentence_elt(sub) if s is not None: sc['sentence'].append(s) return sc def _handle_lusentence_elt(self, elt): """Load a sentence from a subcorpus of an LU from xml.""" info = self._load_xml_attributes(AttrDict(), elt) info['_type'] = 'lusentence' info['annotationSet'] = [] for sub in elt: if sub.tag.endswith('text'): info['text'] = self._strip_tags(sub.text) elif sub.tag.endswith('annotationSet'): annset = self._handle_luannotationset_elt(sub) if annset is not None: info['annotationSet'].append(annset) return info def _handle_luannotationset_elt(self, elt): """Load an annotation set from a sentence in an subcorpus of an LU""" info = self._load_xml_attributes(AttrDict(), elt) info['_type'] = 'luannotationset' info['layer'] = [] for sub in elt: if sub.tag.endswith('layer'): l = self._handle_lulayer_elt(sub) if l is not None: info['layer'].append(l) return info def _handle_lulayer_elt(self, elt): """Load a layer from an annotation set""" layer = self._load_xml_attributes(AttrDict(), elt) layer['_type'] = 'lulayer' layer['label'] = [] for sub in elt: if sub.tag.endswith('label'): l = self._load_xml_attributes(AttrDict(), sub) if l is not None: layer['label'].append(l) return layer def _handle_fe_elt(self, elt): feinfo = self._load_xml_attributes(AttrDict(), elt) feinfo['_type'] = 'fe' feinfo['definition'] = "" feinfo['semType'] = None feinfo['requiresFE'] = None feinfo['excludesFE'] = None for sub in elt: if sub.tag.endswith('definition'): feinfo['definition'] = self._strip_tags(sub.text) elif sub.tag.endswith('semType'): stinfo = self._load_xml_attributes(AttrDict(), sub) feinfo['semType'] = self.semtype(stinfo.ID) elif sub.tag.endswith('requiresFE'): feinfo['requiresFE'] = self._load_xml_attributes(AttrDict(), sub) elif sub.tag.endswith('excludesFE'): feinfo['excludesFE'] = self._load_xml_attributes(AttrDict(), sub) return feinfo def _handle_semtype_elt(self, elt, tagspec=None): semt = self._load_xml_attributes(AttrDict(), elt) semt['_type'] = 'semtype' semt['superType'] = None semt['subTypes'] = PrettyList() for sub in elt: if sub.text is not None: semt['definition'] = self._strip_tags(sub.text) else: supertypeinfo = 
self._load_xml_attributes(AttrDict(), sub) semt['superType'] = supertypeinfo # the supertype may not have been loaded yet return semt # # Demo # def demo(): from nltk.corpus import framenet as fn # # It is not necessary to explicitly build the indexes by calling # buildindexes(). We do this here just for demo purposes. If the # indexes are not built explicitely, they will be built as needed. # print('Building the indexes...') fn.buildindexes() # # Get some statistics about the corpus # print('Number of Frames:', len(fn.frames())) print('Number of Lexical Units:', len(fn.lus())) print('Number of annotated documents:', len(fn.documents())) print() # # Frames # print('getting frames whose name matches the (case insensitive) regex: "(?i)medical"') medframes = fn.frames(r'(?i)medical') print( 'Found {0} Frames whose name matches "(?i)medical":'.format(len(medframes))) print([(f.name, f.ID) for f in medframes]) # # store the first frame in the list of frames # tmp_id = medframes[0].ID m_frame = fn.frame(tmp_id) # reads all info for the frame # # get the frame relations # print( '\nNumber of frame relations for the "{0}" ({1}) frame:'.format(m_frame.name, m_frame.ID), len(m_frame.frameRelations)) for fr in m_frame.frameRelations: print(' ', fr) # # get the names of the Frame Elements # print( '\nNumber of Frame Elements in the "{0}" frame:'.format(m_frame.name), len(m_frame.FE)) print(' ', [x for x in m_frame.FE]) # # get the names of the "Core" Frame Elements # print( '\nThe "core" Frame Elements in the "{0}" frame:'.format(m_frame.name)) print(' ', [x.name for x in m_frame.FE.values() if x.coreType == "Core"]) # # get all of the Lexical Units that are incorporated in the # 'Ailment' FE of the 'Medical_conditions' frame (id=239) # print('\nAll Lexical Units that are incorporated in the "Ailment" FE:') m_frame = fn.frame(239) ailment_lus = [x for x in m_frame.lexUnit.values() if 'incorporatedFE' in x and x.incorporatedFE == 'Ailment'] print(' ', [x.name for x in ailment_lus]) # # get all of the Lexical Units for the frame # print('\nNumber of Lexical Units in the "{0}" frame:'.format(m_frame.name), len(m_frame.lexUnit)) print(' ', [x.name for x in m_frame.lexUnit.values()][:5], '...') # # get basic info on the second LU in the frame # tmp_id = m_frame.lexUnit['ailment.n'].ID # grab the id of the specified LU luinfo = fn.lu_basic(tmp_id) # get basic info on the LU print('\nInformation on the LU: {0}'.format(luinfo.name)) pprint(luinfo) # # Get a list of all of the corpora used for fulltext annotation # print('\nNames of all of the corpora used for fulltext annotation:') allcorpora = set([x.corpname for x in fn.documents()]) pprint(list(allcorpora)) # # Get the names of the annotated documents in the first corpus # firstcorp = list(allcorpora)[0] firstcorp_docs = fn.documents(firstcorp) print( '\nNames of the annotated documents in the "{0}" corpus:'.format(firstcorp)) pprint([x.filename for x in firstcorp_docs]) # # Search for frames containing LUs whose name attribute matches a # regexp pattern. # # Note: if you were going to be doing a lot of this type of # searching, you'd want to build an index that maps from # lemmas to frames because each time frames_by_lemma() is # called, it has to search through ALL of the frame XML files # in the db. 
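    # A minimal sketch of such an index (hypothetical, and left commented out
    # here because building it reads every frame file, which is slow):
    #
    #     lemma_index = {}
    #     for f in fn.frames():
    #         frame = fn.frame(f.ID)           # load the full frame record
    #         for lu_name in frame.lexUnit:    # keys are LU names, e.g. 'run.v'
    #             lemma_index.setdefault(lu_name, []).append(frame.name)
    #
    # lemma_index.get('run.v', []) would then give the frame names directly,
    # without re-scanning the XML files on every lookup.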
print('\nSearching for all Frames that have a lemma that matches the regexp: "^run.v$":') pprint(fn.frames_by_lemma(r'^run.v$')) if __name__ == '__main__': demo() nltk-3.1/nltk/corpus/reader/ieer.py0000644000076500000240000000762212607224144017110 0ustar sbstaff00000000000000# Natural Language Toolkit: IEER Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ Corpus reader for the Information Extraction and Entity Recognition Corpus. NIST 1999 Information Extraction: Entity Recognition Evaluation http://www.itl.nist.gov/iad/894.01/tests/ie-er/er_99/er_99.htm This corpus contains the NEWSWIRE development test data for the NIST 1999 IE-ER Evaluation. The files were taken from the subdirectory: /ie_er_99/english/devtest/newswire/*.ref.nwt and filenames were shortened. The corpus contains the following files: APW_19980314, APW_19980424, APW_19980429, NYT_19980315, NYT_19980403, and NYT_19980407. """ from __future__ import unicode_literals import nltk from nltk import compat from nltk.corpus.reader.api import * #: A dictionary whose keys are the names of documents in this corpus; #: and whose values are descriptions of those documents' contents. titles = { 'APW_19980314': 'Associated Press Weekly, 14 March 1998', 'APW_19980424': 'Associated Press Weekly, 24 April 1998', 'APW_19980429': 'Associated Press Weekly, 29 April 1998', 'NYT_19980315': 'New York Times, 15 March 1998', 'NYT_19980403': 'New York Times, 3 April 1998', 'NYT_19980407': 'New York Times, 7 April 1998', } #: A list of all documents in this corpus. documents = sorted(titles) @compat.python_2_unicode_compatible class IEERDocument(object): def __init__(self, text, docno=None, doctype=None, date_time=None, headline=''): self.text = text self.docno = docno self.doctype = doctype self.date_time = date_time self.headline = headline def __repr__(self): if self.headline: headline = ' '.join(self.headline.leaves()) else: headline = ' '.join([w for w in self.text.leaves() if w[:1] != '<'][:12])+'...' if self.docno is not None: return '' % (self.docno, headline) else: return '' % headline class IEERCorpusReader(CorpusReader): """ """ def raw(self, fileids=None): if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def docs(self, fileids=None): return concat([StreamBackedCorpusView(fileid, self._read_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True)]) def parsed_docs(self, fileids=None): return concat([StreamBackedCorpusView(fileid, self._read_parsed_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True)]) def _read_parsed_block(self,stream): # TODO: figure out while empty documents are being returned return [self._parse(doc) for doc in self._read_block(stream) if self._parse(doc).docno is not None] def _parse(self, doc): val = nltk.chunk.ieerstr2tree(doc, root_label="DOCUMENT") if isinstance(val, dict): return IEERDocument(**val) else: return IEERDocument(val) def _read_block(self, stream): out = [] # Skip any preamble. 
while True: line = stream.readline() if not line: break if line.strip() == '': break out.append(line) # Read the document while True: line = stream.readline() if not line: break out.append(line) if line.strip() == '': break # Return the document return ['\n'.join(out)] nltk-3.1/nltk/corpus/reader/indian.py0000644000076500000240000000607512607224144017427 0ustar sbstaff00000000000000# Natural Language Toolkit: Indian Language POS-Tagged Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ Indian Language POS-Tagged Corpus Collected by A Kumaran, Microsoft Research, India Distributed with permission Contents: - Bangla: IIT Kharagpur - Hindi: Microsoft Research India - Marathi: IIT Bombay - Telugu: IIIT Hyderabad """ from nltk import compat from nltk.tag import str2tuple, map_tag from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class IndianCorpusReader(CorpusReader): """ List of words, one per line. Blank lines are ignored. """ def words(self, fileids=None): return concat([IndianCorpusView(fileid, enc, False, False) for (fileid, enc) in self.abspaths(fileids, True)]) def tagged_words(self, fileids=None, tagset=None): if tagset and tagset != self._tagset: tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) else: tag_mapping_function = None return concat([IndianCorpusView(fileid, enc, True, False, tag_mapping_function) for (fileid, enc) in self.abspaths(fileids, True)]) def sents(self, fileids=None): return concat([IndianCorpusView(fileid, enc, False, True) for (fileid, enc) in self.abspaths(fileids, True)]) def tagged_sents(self, fileids=None, tagset=None): if tagset and tagset != self._tagset: tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) else: tag_mapping_function = None return concat([IndianCorpusView(fileid, enc, True, True, tag_mapping_function) for (fileid, enc) in self.abspaths(fileids, True)]) def raw(self, fileids=None): if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) class IndianCorpusView(StreamBackedCorpusView): def __init__(self, corpus_file, encoding, tagged, group_by_sent, tag_mapping_function=None): self._tagged = tagged self._group_by_sent = group_by_sent self._tag_mapping_function = tag_mapping_function StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) def read_block(self, stream): line = stream.readline() if line.startswith('<'): return [] sent = [str2tuple(word, sep='_') for word in line.split()] if self._tag_mapping_function: sent = [(w, self._tag_mapping_function(t)) for (w,t) in sent] if not self._tagged: sent = [w for (w,t) in sent] if self._group_by_sent: return [sent] else: return sent nltk-3.1/nltk/corpus/reader/ipipan.py0000644000076500000240000003065512607224144017446 0ustar sbstaff00000000000000# Natural Language Toolkit: IPI PAN Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Konrad Goluchowski # URL: # For license information, see LICENSE.TXT import functools from nltk import compat from nltk.corpus.reader.util import StreamBackedCorpusView, concat from nltk.corpus.reader.api import CorpusReader def _parse_args(fun): @functools.wraps(fun) def decorator(self, fileids=None, **kwargs): kwargs.pop('tags', None) if not fileids: fileids = self.fileids() return fun(self, fileids, **kwargs) return decorator class IPIPANCorpusReader(CorpusReader): """ Corpus reader 
designed to work with corpus created by IPI PAN. See http://korpus.pl/en/ for more details about IPI PAN corpus. The corpus includes information about text domain, channel and categories. You can access possible values using ``domains()``, ``channels()`` and ``categories()``. You can use also this metadata to filter files, e.g.: ``fileids(channel='prasa')``, ``fileids(categories='publicystyczny')``. The reader supports methods: words, sents, paras and their tagged versions. You can get part of speech instead of full tag by giving "simplify_tags=True" parameter, e.g.: ``tagged_sents(simplify_tags=True)``. Also you can get all tags disambiguated tags specifying parameter "one_tag=False", e.g.: ``tagged_paras(one_tag=False)``. You can get all tags that were assigned by a morphological analyzer specifying parameter "disamb_only=False", e.g. ``tagged_words(disamb_only=False)``. The IPIPAN Corpus contains tags indicating if there is a space between two tokens. To add special "no space" markers, you should specify parameter "append_no_space=True", e.g. ``tagged_words(append_no_space=True)``. As a result in place where there should be no space between two tokens new pair ('', 'no-space') will be inserted (for tagged data) and just '' for methods without tags. The corpus reader can also try to append spaces between words. To enable this option, specify parameter "append_space=True", e.g. ``words(append_space=True)``. As a result either ' ' or (' ', 'space') will be inserted between tokens. By default, xml entities like " and & are replaced by corresponding characters. You can turn off this feature, specifying parameter "replace_xmlentities=False", e.g. ``words(replace_xmlentities=False)``. """ def __init__(self, root, fileids): CorpusReader.__init__(self, root, fileids, None, None) def raw(self, fileids=None): if not fileids: fileids = self.fileids() filecontents = [] for fileid in self._list_morph_files(fileids): with open(fileid, 'r') as infile: filecontents.append(infile.read()) return ''.join(filecontents) def channels(self, fileids=None): if not fileids: fileids = self.fileids() return self._parse_header(fileids, 'channel') def domains(self, fileids=None): if not fileids: fileids = self.fileids() return self._parse_header(fileids, 'domain') def categories(self, fileids=None): if not fileids: fileids = self.fileids() return [self._map_category(cat) for cat in self._parse_header(fileids, 'keyTerm')] def fileids(self, channels=None, domains=None, categories=None): if channels is not None and domains is not None and \ categories is not None: raise ValueError('You can specify only one of channels, domains ' 'and categories parameter at once') if channels is None and domains is None and \ categories is None: return CorpusReader.fileids(self) if isinstance(channels, compat.string_types): channels = [channels] if isinstance(domains, compat.string_types): domains = [domains] if isinstance(categories, compat.string_types): categories = [categories] if channels: return self._list_morph_files_by('channel', channels) elif domains: return self._list_morph_files_by('domain', domains) else: return self._list_morph_files_by('keyTerm', categories, map=self._map_category) @_parse_args def sents(self, fileids=None, **kwargs): return concat([self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE, tags=False, **kwargs) for fileid in self._list_morph_files(fileids)]) @_parse_args def paras(self, fileids=None, **kwargs): return concat([self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE, tags=False, **kwargs) for fileid 
in self._list_morph_files(fileids)]) @_parse_args def words(self, fileids=None, **kwargs): return concat([self._view(fileid, tags=False, **kwargs) for fileid in self._list_morph_files(fileids)]) @_parse_args def tagged_sents(self, fileids=None, **kwargs): return concat([self._view(fileid, mode=IPIPANCorpusView.SENTS_MODE, **kwargs) for fileid in self._list_morph_files(fileids)]) @_parse_args def tagged_paras(self, fileids=None, **kwargs): return concat([self._view(fileid, mode=IPIPANCorpusView.PARAS_MODE, **kwargs) for fileid in self._list_morph_files(fileids)]) @_parse_args def tagged_words(self, fileids=None, **kwargs): return concat([self._view(fileid, **kwargs) for fileid in self._list_morph_files(fileids)]) def _list_morph_files(self, fileids): return [f for f in self.abspaths(fileids)] def _list_header_files(self, fileids): return [f.replace('morph.xml', 'header.xml') for f in self._list_morph_files(fileids)] def _parse_header(self, fileids, tag): values = set() for f in self._list_header_files(fileids): values_list = self._get_tag(f, tag) for v in values_list: values.add(v) return list(values) def _list_morph_files_by(self, tag, values, map=None): fileids = self.fileids() ret_fileids = set() for f in fileids: fp = self.abspath(f).replace('morph.xml', 'header.xml') values_list = self._get_tag(fp, tag) for value in values_list: if map is not None: value = map(value) if value in values: ret_fileids.add(f) return list(ret_fileids) def _get_tag(self, f, tag): tags = [] with open(f, 'r') as infile: header = infile.read() tag_end = 0 while True: tag_pos = header.find('<'+tag, tag_end) if tag_pos < 0: return tags tag_end = header.find('', tag_pos) tags.append(header[tag_pos+len(tag)+2:tag_end]) def _map_category(self, cat): pos = cat.find('>') if pos == -1: return cat else: return cat[pos+1:] def _view(self, filename, **kwargs): tags = kwargs.pop('tags', True) mode = kwargs.pop('mode', 0) simplify_tags = kwargs.pop('simplify_tags', False) one_tag = kwargs.pop('one_tag', True) disamb_only = kwargs.pop('disamb_only', True) append_no_space = kwargs.pop('append_no_space', False) append_space = kwargs.pop('append_space', False) replace_xmlentities = kwargs.pop('replace_xmlentities', True) if len(kwargs) > 0: raise ValueError('Unexpected arguments: %s' % kwargs.keys()) if not one_tag and not disamb_only: raise ValueError('You cannot specify both one_tag=False and ' 'disamb_only=False') if not tags and (simplify_tags or not one_tag or not disamb_only): raise ValueError('You cannot specify simplify_tags, one_tag or ' 'disamb_only with functions other than tagged_*') return IPIPANCorpusView(filename, tags=tags, mode=mode, simplify_tags=simplify_tags, one_tag=one_tag, disamb_only=disamb_only, append_no_space=append_no_space, append_space=append_space, replace_xmlentities=replace_xmlentities ) class IPIPANCorpusView(StreamBackedCorpusView): WORDS_MODE = 0 SENTS_MODE = 1 PARAS_MODE = 2 def __init__(self, filename, startpos=0, **kwargs): StreamBackedCorpusView.__init__(self, filename, None, startpos, None) self.in_sentence = False self.position = 0 self.show_tags = kwargs.pop('tags', True) self.disamb_only = kwargs.pop('disamb_only', True) self.mode = kwargs.pop('mode', IPIPANCorpusView.WORDS_MODE) self.simplify_tags = kwargs.pop('simplify_tags', False) self.one_tag = kwargs.pop('one_tag', True) self.append_no_space = kwargs.pop('append_no_space', False) self.append_space = kwargs.pop('append_space', False) self.replace_xmlentities = kwargs.pop('replace_xmlentities', True) def read_block(self, stream): 
sentence = [] sentences = [] space = False no_space = False tags = set() lines = self._read_data(stream) while True: # we may have only part of last line if len(lines) <= 1: self._seek(stream) lines = self._read_data(stream) if lines == ['']: assert not sentences return [] line = lines.pop() self.position += len(line) + 1 if line.startswith(''): if self.append_space: no_space = True if self.append_no_space: if self.show_tags: sentence.append(('', 'no-space')) else: sentence.append('') elif line.startswith(' # URL: # For license information, see LICENSE.TXT # For more information, see http://lilyx.net/pages/nltkjapanesecorpus.html from __future__ import print_function import re from nltk.compat import string_types from nltk.parse import DependencyGraph from nltk.corpus.reader.util import ( FileSystemPathPointer, find_corpus_fileids, read_blankline_block, ) from nltk.corpus.reader.api import SyntaxCorpusReader, CorpusReader # default function to convert morphlist to str for tree representation _morphs2str_default = lambda morphs: '/'.join(m[0] for m in morphs if m[0] != 'EOS') class KNBCorpusReader(SyntaxCorpusReader): """ This class implements: - ``__init__``, which specifies the location of the corpus and a method for detecting the sentence blocks in corpus files. - ``_read_block``, which reads a block from the input stream. - ``_word``, which takes a block and returns a list of list of words. - ``_tag``, which takes a block and returns a list of list of tagged words. - ``_parse``, which takes a block and returns a list of parsed sentences. The structure of tagged words: tagged_word = (word(str), tags(tuple)) tags = (surface, reading, lemma, pos1, posid1, pos2, posid2, pos3, posid3, others ...) Usage example ------------- >>> from nltk.corpus.util import LazyCorpusLoader >>> knbc = LazyCorpusLoader( ... 'knbc/corpus1', ... KNBCorpusReader, ... r'.*/KN.*', ... encoding='euc-jp', ... 
) >>> len(knbc.sents()[0]) 9 """ def __init__(self, root, fileids, encoding='utf8', morphs2str=_morphs2str_default): """ Initialize KNBCorpusReader morphs2str is a function to convert morphlist to str for tree representation for _parse() """ CorpusReader.__init__(self, root, fileids, encoding) self.morphs2str = morphs2str def _read_block(self, stream): # blocks are split by blankline (or EOF) - default return read_blankline_block(stream) def _word(self, t): res = [] for line in t.splitlines(): # ignore the Bunsets headers if not re.match(r"EOS|\*|\#|\+", line): cells = line.strip().split(" ") res.append(cells[0]) return res # ignores tagset argument def _tag(self, t, tagset=None): res = [] for line in t.splitlines(): # ignore the Bunsets headers if not re.match(r"EOS|\*|\#|\+", line): cells = line.strip().split(" ") # convert cells to morph tuples res.append((cells[0], ' '.join(cells[1:]))) return res def _parse(self, t): dg = DependencyGraph() i = 0 for line in t.splitlines(): if line[0] in '*+': # start of bunsetsu or tag cells = line.strip().split(" ", 3) m = re.match(r"([\-0-9]*)([ADIP])", cells[1]) assert m is not None node = dg.nodes[i] node.update( { 'address': i, 'rel': m.group(2), 'word': [], } ) dep_parent = int(m.group(1)) if dep_parent == -1: dg.root = node else: dg.nodes[dep_parent]['deps'].append(i) i += 1 elif line[0] != '#': # normal morph cells = line.strip().split(" ") # convert cells to morph tuples morph = cells[0], ' '.join(cells[1:]) dg.nodes[i - 1]['word'].append(morph) if self.morphs2str: for node in dg.nodes.values(): node['word'] = self.morphs2str(node['word']) return dg.tree() ###################################################################### # Demo ###################################################################### def demo(): import nltk from nltk.corpus.util import LazyCorpusLoader root = nltk.data.find('corpora/knbc/corpus1') fileids = [f for f in find_corpus_fileids(FileSystemPathPointer(root), ".*") if re.search(r"\d\-\d\-[\d]+\-[\d]+", f)] def _knbc_fileids_sort(x): cells = x.split('-') return (cells[0], int(cells[1]), int(cells[2]), int(cells[3])) knbc = LazyCorpusLoader('knbc/corpus1', KNBCorpusReader, sorted(fileids, key=_knbc_fileids_sort), encoding='euc-jp') print(knbc.fileids()[:10]) print(''.join(knbc.words()[:100])) print('\n\n'.join(str(tree) for tree in knbc.parsed_sents()[:2])) knbc.morphs2str = lambda morphs: '/'.join( "%s(%s)" % (m[0], m[1].split(' ')[2]) for m in morphs if m[0] != 'EOS' ).encode('utf-8') print('\n\n'.join('%s' % tree for tree in knbc.parsed_sents()[:2])) print( '\n'.join( ' '.join("%s/%s" % (w[0], w[1].split(' ')[2]) for w in sent) for sent in knbc.tagged_sents()[0:2] ) ) def test(): from nltk.corpus.util import LazyCorpusLoader knbc = LazyCorpusLoader( 'knbc/corpus1', KNBCorpusReader, r'.*/KN.*', encoding='euc-jp') assert isinstance(knbc.words()[0], string_types) assert isinstance(knbc.sents()[0][0], string_types) assert isinstance(knbc.tagged_words()[0], tuple) assert isinstance(knbc.tagged_sents()[0][0], tuple) if __name__ == '__main__': demo() nltk-3.1/nltk/corpus/reader/lin.py0000644000076500000240000001362312607224144016744 0ustar sbstaff00000000000000# Natural Language Toolkit: Lin's Thesaurus # # Copyright (C) 2001-2015 NLTK Project # Author: Dan Blanchard # URL: # For license information, see LICENSE.txt from __future__ import print_function import re from collections import defaultdict from functools import reduce from nltk.corpus.reader import CorpusReader class LinThesaurusCorpusReader(CorpusReader): """ 
Wrapper for the LISP-formatted thesauruses distributed by Dekang Lin. """ # Compiled regular expression for extracting the key from the first line of each # thesaurus entry _key_re = re.compile(r'\("?([^"]+)"? \(desc [0-9.]+\).+') @staticmethod def __defaultdict_factory(): ''' Factory for creating defaultdict of defaultdict(dict)s ''' return defaultdict(dict) def __init__(self, root, badscore=0.0): ''' Initialize the thesaurus. :param root: root directory containing thesaurus LISP files :type root: C{string} :param badscore: the score to give to words which do not appear in each other's sets of synonyms :type badscore: C{float} ''' super(LinThesaurusCorpusReader, self).__init__(root, r'sim[A-Z]\.lsp') self._thesaurus = defaultdict(LinThesaurusCorpusReader.__defaultdict_factory) self._badscore = badscore for path, encoding, fileid in self.abspaths(include_encoding=True, include_fileid=True): with open(path) as lin_file: first = True for line in lin_file: line = line.strip() # Start of entry if first: key = LinThesaurusCorpusReader._key_re.sub(r'\1', line) first = False # End of entry elif line == '))': first = True # Lines with pairs of ngrams and scores else: split_line = line.split('\t') if len(split_line) == 2: ngram, score = split_line self._thesaurus[fileid][key][ngram.strip('"')] = float(score) def similarity(self, ngram1, ngram2, fileid=None): ''' Returns the similarity score for two ngrams. :param ngram1: first ngram to compare :type ngram1: C{string} :param ngram2: second ngram to compare :type ngram2: C{string} :param fileid: thesaurus fileid to search in. If None, search all fileids. :type fileid: C{string} :return: If fileid is specified, just the score for the two ngrams; otherwise, list of tuples of fileids and scores. ''' # Entries don't contain themselves, so make sure similarity between item and itself is 1.0 if ngram1 == ngram2: if fileid: return 1.0 else: return [(fid, 1.0) for fid in self._fileids] else: if fileid: return self._thesaurus[fileid][ngram1][ngram2] if ngram2 in self._thesaurus[fileid][ngram1] else self._badscore else: return [(fid, (self._thesaurus[fid][ngram1][ngram2] if ngram2 in self._thesaurus[fid][ngram1] else self._badscore)) for fid in self._fileids] def scored_synonyms(self, ngram, fileid=None): ''' Returns a list of scored synonyms (tuples of synonyms and scores) for the current ngram :param ngram: ngram to lookup :type ngram: C{string} :param fileid: thesaurus fileid to search in. If None, search all fileids. :type fileid: C{string} :return: If fileid is specified, list of tuples of scores and synonyms; otherwise, list of tuples of fileids and lists, where inner lists consist of tuples of scores and synonyms. ''' if fileid: return self._thesaurus[fileid][ngram].items() else: return [(fileid, self._thesaurus[fileid][ngram].items()) for fileid in self._fileids] def synonyms(self, ngram, fileid=None): ''' Returns a list of synonyms for the current ngram. :param ngram: ngram to lookup :type ngram: C{string} :param fileid: thesaurus fileid to search in. If None, search all fileids. :type fileid: C{string} :return: If fileid is specified, list of synonyms; otherwise, list of tuples of fileids and lists, where inner lists contain synonyms. ''' if fileid: return self._thesaurus[fileid][ngram].keys() else: return [(fileid, self._thesaurus[fileid][ngram].keys()) for fileid in self._fileids] def __contains__(self, ngram): ''' Determines whether or not the given ngram is in the thesaurus. 
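        Example (a sketch; assumes the Lin thesaurus data is installed and
        that "business", the word used in the demo below, appears in it):

            >>> from nltk.corpus import lin_thesaurus as thes
            >>> "business" in thes    # doctest: +SKIP
            True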
:param ngram: ngram to lookup :type ngram: C{string} :return: whether the given ngram is in the thesaurus. ''' return reduce(lambda accum, fileid: accum or (ngram in self._thesaurus[fileid]), self._fileids, False) ###################################################################### # Demo ###################################################################### def demo(): from nltk.corpus import lin_thesaurus as thes word1 = "business" word2 = "enterprise" print("Getting synonyms for " + word1) print(thes.synonyms(word1)) print("Getting scored synonyms for " + word1) print(thes.scored_synonyms(word1)) print("Getting synonyms from simN.lsp (noun subsection) for " + word1) print(thes.synonyms(word1, fileid="simN.lsp")) print("Getting synonyms from simN.lsp (noun subsection) for " + word1) print(thes.synonyms(word1, fileid="simN.lsp")) print("Similarity score for %s and %s:" % (word1, word2)) print(thes.similarity(word1, word2)) if __name__ == '__main__': demo() nltk-3.1/nltk/corpus/reader/mte.py0000644000076500000240000003101212607224144016737 0ustar sbstaff00000000000000""" A reader for corpora whose documents are in MTE format. """ import os from functools import reduce from nltk import compat from nltk.corpus.reader import concat, TaggedCorpusReader lxmlAvailable = False try: from lxml import etree lxmlAvailable = True except ImportError: #first try c version of ElementTree try: import xml.etree.cElementTree as etree except ImportError: import xml.etree.ElementTree as etree import re def xpath(root, path, ns): if lxmlAvailable: return root.xpath(path, namespaces=ns) else: return root.findall(path, ns) class MTEFileReader: """ Class for loading the content of the multext-east corpus. It parses the xml files and does some tag-filtering depending on the given method parameters. 
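    Direct use is possible but uncommon; this class is normally driven by
    ``MTECorpusReader`` below.  A sketch with a hypothetical file path:

        >>> fr = MTEFileReader('/path/to/oana-en.xml')    # doctest: +SKIP
        >>> fr.words()[:10]                               # doctest: +SKIP
        >>> fr.tagged_words()[:5]                         # doctest: +SKIP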
""" ns = {'tei': 'http://www.tei-c.org/ns/1.0', 'xml': 'http://www.w3.org/XML/1998/namespace'} tag_ns = '{http://www.tei-c.org/ns/1.0}' xml_ns = '{http://www.w3.org/XML/1998/namespace}' def __init__(self, file_path): tree = etree.parse(file_path) self.__root = xpath(tree.getroot(), './tei:text/tei:body', self.ns)[0] @classmethod def _words(self, text_root): return [w.text for w in xpath(text_root, './/*', self.ns) if w.tag == self.tag_ns + "w" or w.tag == self.tag_ns + "c"] @classmethod def _sents(self, text_root): return [MTEFileReader._words(s) for s in xpath(text_root, './/tei:s', self.ns)] @classmethod def _paras(self, text_root): return [MTEFileReader._sents(p) for p in xpath(text_root, './/tei:p', self.ns)] @classmethod def _lemma_words(self, text_root): return [(w.text, w.attrib['lemma']) for w in xpath(text_root, './/tei:w', self.ns)] @classmethod def _tagged_words(self, text_root, tags=""): if tags is None or tags == "": return [(w.text, w.attrib['ana']) for w in xpath(text_root, './/tei:w', self.ns)] else: tags = re.compile('^' + re.sub("-",".",tags) + '.*$') return [(w.text, w.attrib['ana']) for w in xpath(text_root, './/tei:w', self.ns) if tags.match(w.attrib['ana'])] @classmethod def _lemma_sents(self, text_root): return [MTEFileReader._lemma_words(s) for s in xpath(text_root, './/tei:s', self.ns)] @classmethod def _tagged_sents(self, text_root, tags=""): # double list comprehension to remove empty sentences in case there is a sentence only containing punctuation marks return [t for t in [MTEFileReader._tagged_words(s, tags) for s in xpath(text_root, './/tei:s', self.ns)] if len(t) > 0] @classmethod def _lemma_paras(self, text_root): return [MTEFileReader._lemma_sents(p) for p in xpath(text_root, './/tei:p', self.ns)] @classmethod def _tagged_paras(self, text_root, tags=""): return [t for t in [MTEFileReader._tagged_sents(p, tags) for p in xpath(text_root, './/tei:p', self.ns)] if len(t) > 0] def words(self): return MTEFileReader._words(self.__root) def sents(self): return MTEFileReader._sents(self.__root) def paras(self): return MTEFileReader._paras(self.__root) def lemma_words(self): return MTEFileReader._lemma_words(self.__root) def tagged_words(self, tags=""): return MTEFileReader._tagged_words(self.__root, tags) def lemma_sents(self): return MTEFileReader._lemma_sents(self.__root) def tagged_sents(self, tags=""): return MTEFileReader._tagged_sents(self.__root) def lemma_paras(self): return MTEFileReader._lemma_paras(self.__root) def tagged_paras(self, tags=""): return MTEFileReader._tagged_paras(self.__root) class MTETagConverter: """ Class for converting msd tags to universal tags, more conversion options are currently not implemented. """ mapping_msd_universal = { 'A': 'ADJ', 'S': 'ADP', 'R': 'ADV', 'C': 'CONJ', 'D': 'DET', 'N': 'NOUN', 'M': 'NUM', 'Q': 'PRT', 'P': 'PRON', 'V': 'VERB', '.': '.', '-': 'X'} @staticmethod def msd_to_universal(tag): """ This function converts the annotation from the Multex-East to the universal tagset as described in Chapter 5 of the NLTK-Book Unknown Tags will be mapped to X. Punctuation marks are not supported in MSD tags, so """ indicator = tag[0] if not tag[0] == "#" else tag[1] if not indicator in MTETagConverter.mapping_msd_universal: indicator = '-' return MTETagConverter.mapping_msd_universal[indicator] class MTECorpusReader(TaggedCorpusReader): """ Reader for corpora following the TEI-p5 xml scheme, such as MULTEXT-East. MULTEXT-East contains part-of-speech-tagged words with a quite precise tagging scheme. 
These tags can be converted to the Universal tagset """ def __init__(self, root=None, fileids=None, encoding='utf8'): """ Construct a new MTECorpusreader for a set of documents located at the given root directory. Example usage: >>> root = '/...path to corpus.../' >>> reader = MTECorpusReader(root, 'oana-*.xml', 'utf8') # doctest: +SKIP :param root: The root directory for this corpus. (default points to location in multext config file) :param fileids: A list or regexp specifying the fileids in this corpus. (default is oana-en.xml) :param enconding: The encoding of the given files (default is utf8) """ TaggedCorpusReader.__init__(self, root, fileids, encoding) def __fileids(self, fileids): if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] # filter wrong userinput fileids = filter(lambda x : x in self._fileids, fileids) # filter multext-east sourcefiles that are not compatible to the teip5 specification fileids = filter(lambda x : x not in ["oana-bg.xml", "oana-mk.xml"], fileids) if not fileids: print("No valid multext-east file specified") return fileids def readme(self): """ Prints some information about this corpus. :return: the content of the attached README file :rtype: str """ return self.open("00README.txt").read() def raw(self, fileids=None): """ :param fileids: A list specifying the fileids that should be used. :return: the given file(s) as a single string. :rtype: str """ return concat([self.open(f).read() for f in self.__fileids(fileids)]) def words(self, fileids=None): """ :param fileids: A list specifying the fileids that should be used. :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).words() for f in self.__fileids(fileids)], []) def sents(self, fileids=None): """ :param fileids: A list specifying the fileids that should be used. :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings :rtype: list(list(str)) """ return reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).sents() for f in self.__fileids(fileids)], []) def paras(self, fileids=None): """ :param fileids: A list specifying the fileids that should be used. :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as lists of word string :rtype: list(list(list(str))) """ return reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).paras() for f in self.__fileids(fileids)], []) def lemma_words(self, fileids=None): """ :param fileids: A list specifying the fileids that should be used. :return: the given file(s) as a list of words, the corresponding lemmas and punctuation symbols, encoded as tuples (word, lemma) :rtype: list(tuple(str,str)) """ return reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).lemma_words() for f in self.__fileids(fileids)], []) def tagged_words(self, fileids=None, tagset="msd", tags=None): """ :param fileids: A list specifying the fileids that should be used. 
:param tagset: The tagset that should be used in the returned object, either "universal" or "msd", "msd" is the default :param tags: An MSD Tag that is used to filter all parts of the used corpus that are not more precise or at least equal to the given tag :return: the given file(s) as a list of tagged words and punctuation symbols encoded as tuples (word, tag) :rtype: list(tuple(str, str)) """ words = reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).tagged_words(tags=tags) for f in self.__fileids(fileids)], []) if tagset == "universal": return map(lambda wt : (wt[0], MTETagConverter.msd_to_universal(wt[1])), words) elif tagset == "msd": return words else: print("Unknown tagset specified.") def lemma_sents(self, fileids=None): """ :param fileids: A list specifying the fileids that should be used. :return: the given file(s) as a list of sentences or utterances, each encoded as a list of tuples of the word and the corresponding lemma (word, lemma) :rtype: list(list(tuple(str, str))) """ return reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).lemma_sents() for f in self.__fileids(fileids)], []) def tagged_sents(self, fileids=None, tagset="msd", tags=None): """ :param fileids: A list specifying the fileids that should be used. :param tagset: The tagset that should be used in the returned object, either "universal" or "msd", "msd" is the default :param tags: An MSD Tag that is used to filter all parts of the used corpus that are not more precise or at least equal to the given tag :return: the given file(s) as a list of sentences or utterances, each each encoded as a list of (word,tag) tuples :rtype: list(list(tuple(str, str))) """ sents = reduce(lambda a, b : a + b, [MTEFileReader(os.path.join(self._root, f)).tagged_sents(tags=tags) for f in self.__fileids(fileids)], []) if tagset == "universal": return map(lambda s : map (lambda wt : (wt[0], MTETagConverter.msd_to_universal(wt[1])), s), sents) elif tagset == "msd": return sents else: print("Unknown tagset specified.") def lemma_paras(self, fileids=None): """ :param fileids: A list specifying the fileids that should be used. :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as a list of tuples of the word and the corresponding lemma (word, lemma) :rtype: list(List(List(tuple(str, str)))) """ return reduce(lambda a, b : a + b ,[MTEFileReader(os.path.join(self._root, f)).lemma_paras() for f in self.__fileids(fileids)], []) def tagged_paras(self, fileids=None, tagset="msd", tags=None): """ :param fileids: A list specifying the fileids that should be used. 
:param tagset: The tagset that should be used in the returned object, either "universal" or "msd", "msd" is the default :param tags: An MSD Tag that is used to filter all parts of the used corpus that are not more precise or at least equal to the given tag :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as a list of (word,tag) tuples :rtype: list(list(list(tuple(str, str)))) """ paras = reduce(lambda a, b : a + b, [MTEFileReader(os.path.join(self._root, f)).tagged_paras(tags=tags) for f in self.__fileids(fileids)], []) if tagset == "universal": return map(lambda p : map(lambda s : map (lambda wt : (wt[0], MTETagConverter.msd_to_universal(wt[0])), s), p), paras) elif tagset == "msd": return paras else: print("Unknown tagset specified.") nltk-3.1/nltk/corpus/reader/nkjp.py0000644000076500000240000003506612607224144017131 0ustar sbstaff00000000000000# Natural Language Toolkit: NKJP Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Gabriela Kaczka # URL: # For license information, see LICENSE.TXT import functools import os import tempfile from nltk import compat from nltk.corpus.reader.util import concat from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView import re def _parse_args(fun): """ Wraps function arguments: if fileids not specified then function set NKJPCorpusReader paths. """ @functools.wraps(fun) def decorator(self, fileids=None, **kwargs): if not fileids: fileids = self._paths return fun(self, fileids, **kwargs) return decorator class NKJPCorpusReader(XMLCorpusReader): WORDS_MODE = 0 SENTS_MODE = 1 HEADER_MODE = 2 RAW_MODE = 3 def __init__(self, root, fileids='.*'): """ Corpus reader designed to work with National Corpus of Polish. See http://nkjp.pl/ for more details about NKJP. use example: import nltk import nkjp from nkjp import NKJPCorpusReader x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='') # obtain the whole corpus x.header() x.raw() x.words() x.tagged_words(tags=['subst', 'comp']) #Link to find more tags: nkjp.pl/poliqarp/help/ense2.html x.sents() x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='Wilk*') # obtain particular file(s) x.header(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy']) x.tagged_words(fileids=['WilkDom', '/home/USER/nltk_data/corpora/nkjp/WilkWilczy'], tags=['subst', 'comp']) """ if isinstance(fileids, compat.string_types): XMLCorpusReader.__init__(self, root, fileids + '.*/header.xml') else: XMLCorpusReader.__init__(self, root, [fileid + '/header.xml' for fileid in fileids]) self._paths = self.get_paths() def get_paths(self): return [os.path.join(str(self._root), f.split("header.xml")[0]) for f in self._fileids] def fileids(self): """ Returns a list of file identifiers for the fileids that make up this corpus. """ return [f.split("header.xml")[0] for f in self._fileids] def _view(self, filename, tags=None, **kwargs): """ Returns a view specialised for use with particular corpus file. 
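        The public reader methods dispatch to the modes handled here.  A
        sketch, assuming the NKJP data is installed under nltk_data (the
        root path is hypothetical):

            >>> x = NKJPCorpusReader(root='/home/USER/nltk_data/corpora/nkjp/', fileids='')  # doctest: +SKIP
            >>> x.words()     # WORDS_MODE    # doctest: +SKIP
            >>> x.sents()     # SENTS_MODE    # doctest: +SKIP
            >>> x.header()    # HEADER_MODE   # doctest: +SKIP
            >>> x.raw()       # RAW_MODE      # doctest: +SKIP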
""" mode = kwargs.pop('mode', NKJPCorpusReader.WORDS_MODE) if mode is NKJPCorpusReader.WORDS_MODE: return NKJPCorpus_Morph_View(filename, tags=tags) elif mode is NKJPCorpusReader.SENTS_MODE: return NKJPCorpus_Segmentation_View(filename, tags=tags) elif mode is NKJPCorpusReader.HEADER_MODE: return NKJPCorpus_Header_View(filename, tags=tags) elif mode is NKJPCorpusReader.RAW_MODE: return NKJPCorpus_Text_View(filename, tags=tags, mode=NKJPCorpus_Text_View.RAW_MODE) else: raise NameError('No such mode!') def add_root(self, fileid): """ Add root if necessary to specified fileid. """ if self.root in fileid: return fileid return self.root + fileid @_parse_args def header(self, fileids=None, **kwargs): """ Returns header(s) of specified fileids. """ return concat([self._view(self.add_root(fileid), mode=NKJPCorpusReader.HEADER_MODE, **kwargs).handle_query() for fileid in fileids]) @_parse_args def sents(self, fileids=None, **kwargs): """ Returns sentences in specified fileids. """ return concat([self._view(self.add_root(fileid), mode=NKJPCorpusReader.SENTS_MODE, **kwargs).handle_query() for fileid in fileids]) @_parse_args def words(self, fileids=None, **kwargs): """ Returns words in specified fileids. """ return concat([self._view(self.add_root(fileid), mode=NKJPCorpusReader.WORDS_MODE, **kwargs).handle_query() for fileid in fileids]) @_parse_args def tagged_words(self, fileids=None, **kwargs): """ Call with specified tags as a list, e.g. tags=['subst', 'comp']. Returns tagged words in specified fileids. """ tags = kwargs.pop('tags', []) return concat([self._view(self.add_root(fileid), mode=NKJPCorpusReader.WORDS_MODE, tags=tags, **kwargs).handle_query() for fileid in fileids]) @_parse_args def raw(self, fileids=None, **kwargs): """ Returns words in specified fileids. """ return concat([self._view(self.add_root(fileid), mode=NKJPCorpusReader.RAW_MODE, **kwargs).handle_query() for fileid in fileids]) class NKJPCorpus_Header_View(XMLCorpusView): def __init__(self, filename, **kwargs): """ HEADER_MODE A stream backed corpus view specialized for use with header.xml files in NKJP corpus. """ self.tagspec = ".*/sourceDesc$" XMLCorpusView.__init__(self, filename + 'header.xml', self.tagspec) def handle_query(self): self._open() header = [] while True: segm = XMLCorpusView.read_block(self, self._stream) if len(segm) == 0: break header.extend(segm) self.close() return header def handle_elt(self, elt, context): titles = elt.findall('bibl/title') title = [] if titles: title = '\n'.join(title.text.strip() for title in titles) authors = elt.findall('bibl/author') author = [] if authors: author = '\n'.join(author.text.strip() for author in authors) dates = elt.findall('bibl/date') date = [] if dates: date = '\n'.join(date.text.strip() for date in dates) publishers = elt.findall('bibl/publisher') publisher = [] if publishers: publisher = '\n'.join(publisher.text.strip() for publisher in publishers) idnos = elt.findall('bibl/idno') idno = [] if idnos: idno = '\n'.join(idno.text.strip() for idno in idnos) notes = elt.findall('bibl/note') note = [] if notes: note = '\n'.join(note.text.strip() for note in notes) return {'title': title, 'author': author, 'date': date, 'publisher': publisher, 'idno': idno, 'note': note} class XML_Tool(): """ Helper class creating xml file to one without references to nkjp: namespace. 
That's needed because the XMLCorpusView assumes that one can find short substrings of XML that are valid XML, which is not true if a namespace is declared at top level """ def __init__(self, root, filename): self.read_file = os.path.join(root, filename) self.write_file = tempfile.NamedTemporaryFile(delete=False) def build_preprocessed_file(self): try: fr = open(self.read_file, 'r') fw = self.write_file line = ' ' while len(line): line = fr.readline() x = re.split(r'nkjp:[^ ]* ', line) #in all files ret = ' '.join(x) x = re.split('', ret) #in ann_segmentation.xml ret = ' '.join(x) x = re.split('', ret) #in ann_segmentation.xml ret = ' '.join(x) x = re.split('', ret) #in ann_segmentation.xml ret = ' '.join(x) x = re.split('', ret) #in ann_segmentation.xml ret = ' '.join(x) fw.write(ret) fr.close() fw.close() return self.write_file.name except Exception: self.remove_preprocessed_file() raise Exception def remove_preprocessed_file(self): os.remove(self.write_file.name) pass class NKJPCorpus_Segmentation_View(XMLCorpusView): """ A stream backed corpus view specialized for use with ann_segmentation.xml files in NKJP corpus. """ def __init__(self, filename, **kwargs): self.tagspec = '.*p/.*s' #intersperse NKJPCorpus_Text_View self.text_view = NKJPCorpus_Text_View(filename, mode=NKJPCorpus_Text_View.SENTS_MODE) self.text_view.handle_query() #xml preprocessing self.xml_tool = XML_Tool(filename, 'ann_segmentation.xml') #base class init XMLCorpusView.__init__(self, self.xml_tool.build_preprocessed_file(), self.tagspec) def get_segm_id(self, example_word): return example_word.split('(')[1].split(',')[0] def get_sent_beg(self, beg_word): #returns index of beginning letter in sentence return int(beg_word.split(',')[1]) def get_sent_end(self, end_word): #returns index of end letter in sentence splitted = end_word.split(')')[0].split(',') return int(splitted[1]) + int(splitted[2]) def get_sentences(self, sent_segm): #returns one sentence id = self.get_segm_id(sent_segm[0]) segm = self.text_view.segm_dict[id] #text segment beg = self.get_sent_beg(sent_segm[0]) end = self.get_sent_end(sent_segm[len(sent_segm)-1]) return segm[beg:end] def remove_choice(self, segm): ret = [] prev_txt_end = -1 prev_txt_nr = -1 for word in segm: txt_nr = self.get_segm_id(word) #get increasing sequence of ids: in case of choice get first possibility if self.get_sent_beg(word) > prev_txt_end-1 or prev_txt_nr != txt_nr: ret.append(word) prev_txt_end = self.get_sent_end(word) prev_txt_nr = txt_nr return ret def handle_query(self): try: self._open() sentences = [] while True: sent_segm = XMLCorpusView.read_block(self, self._stream) if len(sent_segm) == 0: break for segm in sent_segm: segm = self.remove_choice(segm) sentences.append(self.get_sentences(segm)) self.close() self.xml_tool.remove_preprocessed_file() return sentences except Exception: self.xml_tool.remove_preprocessed_file() raise Exception def handle_elt(self, elt, context): ret = [] for seg in elt: ret.append(seg.get('corresp')) return ret class NKJPCorpus_Text_View(XMLCorpusView): """ A stream backed corpus view specialized for use with text.xml files in NKJP corpus. 
""" SENTS_MODE = 0 RAW_MODE = 1 def __init__(self, filename, **kwargs): self.mode = kwargs.pop('mode', 0) self.tagspec = '.*/div/ab' self.segm_dict = dict() #xml preprocessing self.xml_tool = XML_Tool(filename, 'text.xml') #base class init XMLCorpusView.__init__(self, self.xml_tool.build_preprocessed_file(), self.tagspec) def handle_query(self): try: self._open() x = self.read_block(self._stream) self.close() self.xml_tool.remove_preprocessed_file() return x except Exception: self.xml_tool.remove_preprocessed_file() raise Exception def read_block(self, stream, tagspec=None, elt_handler=None): """ Returns text as a list of sentences. """ txt = [] while True: segm = XMLCorpusView.read_block(self, stream) if len(segm) == 0: break for part in segm: txt.append(part) return [' '.join([segm for segm in txt])] def get_segm_id(self, elt): for attr in elt.attrib: if attr.endswith('id'): return elt.get(attr) def handle_elt(self, elt, context): #fill dictionary to use later in sents mode if self.mode is NKJPCorpus_Text_View.SENTS_MODE: self.segm_dict[self.get_segm_id(elt)] = elt.text return elt.text class NKJPCorpus_Morph_View(XMLCorpusView): """ A stream backed corpus view specialized for use with ann_morphosyntax.xml files in NKJP corpus. """ def __init__(self, filename, **kwargs): self.tags = kwargs.pop('tags', None) self.tagspec = '.*/seg/fs' self.xml_tool = XML_Tool(filename, 'ann_morphosyntax.xml') XMLCorpusView.__init__(self, self.xml_tool.build_preprocessed_file(), self.tagspec) def handle_query(self): try: self._open() words = [] while True: segm = XMLCorpusView.read_block(self, self._stream) if len(segm) == 0: break for part in segm: if part is not None: words.append(part) self.close() self.xml_tool.remove_preprocessed_file() return words except Exception: self.xml_tool.remove_preprocessed_file() raise Exception def handle_elt(self, elt, context): word = '' flag = False is_not_interp = True #if tags not specified, then always return word if self.tags is None: flag = True for child in elt: #get word if 'name' in child.keys() and child.attrib['name'] == 'orth': for symbol in child: if symbol.tag == 'string': word = symbol.text elif 'name' in child.keys() and child.attrib['name'] == 'interps': for symbol in child: if 'type' in symbol.keys() and symbol.attrib['type'] == 'lex': for symbol2 in symbol: if 'name' in symbol2.keys() and symbol2.attrib['name'] == 'ctag': for symbol3 in symbol2: if 'value' in symbol3.keys() and self.tags is not None and symbol3.attrib['value'] in self.tags: flag = True elif 'value' in symbol3.keys() and symbol3.attrib['value'] == 'interp': is_not_interp = False if flag and is_not_interp: return word nltk-3.1/nltk/corpus/reader/nombank.py0000644000076500000240000003767512607224144017624 0ustar sbstaff00000000000000# Natural Language Toolkit: NomBank Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Authors: Paul Bedaride # Edward Loper # URL: # For license information, see LICENSE.TXT from __future__ import unicode_literals from nltk.tree import Tree from xml.etree import ElementTree from nltk.internals import raise_unorderable_types from nltk.compat import total_ordering, python_2_unicode_compatible, string_types from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class NombankCorpusReader(CorpusReader): """ Corpus reader for the nombank corpus, which augments the Penn Treebank with information about the predicate argument structure of every noun instance. 
The corpus consists of two parts: the predicate-argument annotations themselves, and a set of "frameset files" which define the argument labels used by the annotations, on a per-noun basis. Each "frameset file" contains one or more predicates, such as ``'turn'`` or ``'turn_on'``, each of which is divided into coarse-grained word senses called "rolesets". For each "roleset", the frameset file provides descriptions of the argument roles, along with examples. """ def __init__(self, root, nomfile, framefiles='', nounsfile=None, parse_fileid_xform=None, parse_corpus=None, encoding='utf8'): """ :param root: The root directory for this corpus. :param nomfile: The name of the file containing the predicate- argument annotations (relative to ``root``). :param framefiles: A list or regexp specifying the frameset fileids for this corpus. :param parse_fileid_xform: A transform that should be applied to the fileids in this corpus. This should be a function of one argument (a fileid) that returns a string (the new fileid). :param parse_corpus: The corpus containing the parse trees corresponding to this corpus. These parse trees are necessary to resolve the tree pointers used by nombank. """ # If framefiles is specified as a regexp, expand it. if isinstance(framefiles, string_types): framefiles = find_corpus_fileids(root, framefiles) framefiles = list(framefiles) # Initialze the corpus reader. CorpusReader.__init__(self, root, [nomfile, nounsfile] + framefiles, encoding) # Record our frame fileids & nom file. self._nomfile = nomfile self._framefiles = framefiles self._nounsfile = nounsfile self._parse_fileid_xform = parse_fileid_xform self._parse_corpus = parse_corpus def raw(self, fileids=None): """ :return: the text contents of the given fileids, as a single string. """ if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def instances(self, baseform=None): """ :return: a corpus view that acts as a list of ``NombankInstance`` objects, one for each noun in the corpus. """ kwargs = {} if baseform is not None: kwargs['instance_filter'] = lambda inst: inst.baseform==baseform return StreamBackedCorpusView(self.abspath(self._nomfile), lambda stream: self._read_instance_block(stream, **kwargs), encoding=self.encoding(self._nomfile)) def lines(self): """ :return: a corpus view that acts as a list of strings, one for each line in the predicate-argument annotation file. """ return StreamBackedCorpusView(self.abspath(self._nomfile), read_line_block, encoding=self.encoding(self._nomfile)) def roleset(self, roleset_id): """ :return: the xml description for the given roleset. """ baseform = roleset_id.split('.')[0] baseform = baseform.replace('perc-sign','%') baseform = baseform.replace('oneslashonezero', '1/10').replace('1/10','1-slash-10') framefile = 'frames/%s.xml' % baseform if framefile not in self._framefiles: raise ValueError('Frameset file for %s not found' % roleset_id) # n.b.: The encoding for XML fileids is specified by the file # itself; so we ignore self._encoding here. etree = ElementTree.parse(self.abspath(framefile).open()).getroot() for roleset in etree.findall('predicate/roleset'): if roleset.attrib['id'] == roleset_id: return roleset else: raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile)) def rolesets(self, baseform=None): """ :return: list of xml descriptions for rolesets. 
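        Example (illustrative only; requires the nombank frames data, and
        the baseform 'turn' is just the one mentioned in the class
        description above):

            >>> from nltk.corpus import nombank                        # doctest: +SKIP
            >>> [r.attrib['id'] for r in nombank.rolesets('turn')]     # doctest: +SKIP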
""" if baseform is not None: framefile = 'frames/%s.xml' % baseform if framefile not in self._framefiles: raise ValueError('Frameset file for %s not found' % baseform) framefiles = [framefile] else: framefiles = self._framefiles rsets = [] for framefile in framefiles: # n.b.: The encoding for XML fileids is specified by the file # itself; so we ignore self._encoding here. etree = ElementTree.parse(self.abspath(framefile).open()).getroot() rsets.append(etree.findall('predicate/roleset')) return LazyConcatenation(rsets) def nouns(self): """ :return: a corpus view that acts as a list of all noun lemmas in this corpus (from the nombank.1.0.words file). """ return StreamBackedCorpusView(self.abspath(self._nounsfile), read_line_block, encoding=self.encoding(self._nounsfile)) def _read_instance_block(self, stream, instance_filter=lambda inst: True): block = [] # Read 100 at a time. for i in range(100): line = stream.readline().strip() if line: inst = NombankInstance.parse( line, self._parse_fileid_xform, self._parse_corpus) if instance_filter(inst): block.append(inst) return block ###################################################################### #{ Nombank Instance & related datatypes ###################################################################### @python_2_unicode_compatible class NombankInstance(object): def __init__(self, fileid, sentnum, wordnum, baseform, sensenumber, predicate, predid, arguments, parse_corpus=None): self.fileid = fileid """The name of the file containing the parse tree for this instance's sentence.""" self.sentnum = sentnum """The sentence number of this sentence within ``fileid``. Indexing starts from zero.""" self.wordnum = wordnum """The word number of this instance's predicate within its containing sentence. Word numbers are indexed starting from zero, and include traces and other empty parse elements.""" self.baseform = baseform """The baseform of the predicate.""" self.sensenumber = sensenumber """The sense number of the predicate.""" self.predicate = predicate """A ``NombankTreePointer`` indicating the position of this instance's predicate within its containing sentence.""" self.predid = predid """Identifier of the predicate.""" self.arguments = tuple(arguments) """A list of tuples (argloc, argid), specifying the location and identifier for each of the predicate's argument in the containing sentence. Argument identifiers are strings such as ``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain the predicate.""" self.parse_corpus = parse_corpus """A corpus reader for the parse trees corresponding to the instances in this nombank corpus.""" @property def roleset(self): """The name of the roleset used by this instance's predicate. 
Use ``nombank.roleset() `` to look up information about the roleset.""" r = self.baseform.replace('%', 'perc-sign') r = r.replace('1/10', '1-slash-10').replace('1-slash-10', 'oneslashonezero') return '%s.%s'%(r, self.sensenumber) def __repr__(self): return ('' % (self.fileid, self.sentnum, self.wordnum)) def __str__(self): s = '%s %s %s %s %s' % (self.fileid, self.sentnum, self.wordnum, self.baseform, self.sensenumber) items = self.arguments + ((self.predicate, 'rel'),) for (argloc, argid) in sorted(items): s += ' %s-%s' % (argloc, argid) return s def _get_tree(self): if self.parse_corpus is None: return None if self.fileid not in self.parse_corpus.fileids(): return None return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum] tree = property(_get_tree, doc=""" The parse tree corresponding to this instance, or None if the corresponding tree is not available.""") @staticmethod def parse(s, parse_fileid_xform=None, parse_corpus=None): pieces = s.split() if len(pieces) < 6: raise ValueError('Badly formatted nombank line: %r' % s) # Divide the line into its basic pieces. (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5] args = pieces[5:] rel = [args.pop(i) for i,p in enumerate(args) if '-rel' in p] if len(rel) != 1: raise ValueError('Badly formatted nombank line: %r' % s) # Apply the fileid selector, if any. if parse_fileid_xform is not None: fileid = parse_fileid_xform(fileid) # Convert sentence & word numbers to ints. sentnum = int(sentnum) wordnum = int(wordnum) # Parse the predicate location. predloc, predid = rel[0].split('-', 1) predicate = NombankTreePointer.parse(predloc) # Parse the arguments. arguments = [] for arg in args: argloc, argid = arg.split('-', 1) arguments.append( (NombankTreePointer.parse(argloc), argid) ) # Put it all together. return NombankInstance(fileid, sentnum, wordnum, baseform, sensenumber, predicate, predid, arguments, parse_corpus) class NombankPointer(object): """ A pointer used by nombank to identify one or more constituents in a parse tree. ``NombankPointer`` is an abstract base class with three concrete subclasses: - ``NombankTreePointer`` is used to point to single constituents. - ``NombankSplitTreePointer`` is used to point to 'split' constituents, which consist of a sequence of two or more ``NombankTreePointer`` pointers. - ``NombankChainTreePointer`` is used to point to entire trace chains in a tree. It consists of a sequence of pieces, which can be ``NombankTreePointer`` or ``NombankSplitTreePointer`` pointers. """ def __init__(self): if self.__class__ == NombankPointer: raise NotImplementedError() @python_2_unicode_compatible class NombankChainTreePointer(NombankPointer): def __init__(self, pieces): self.pieces = pieces """A list of the pieces that make up this chain. Elements may be either ``NombankSplitTreePointer`` or ``NombankTreePointer`` pointers.""" def __str__(self): return '*'.join('%s' % p for p in self.pieces) def __repr__(self): return '' % self def select(self, tree): if tree is None: raise ValueError('Parse tree not avaialable') return Tree('*CHAIN*', [p.select(tree) for p in self.pieces]) @python_2_unicode_compatible class NombankSplitTreePointer(NombankPointer): def __init__(self, pieces): self.pieces = pieces """A list of the pieces that make up this chain. 
Elements are all ``NombankTreePointer`` pointers.""" def __str__(self): return ','.join('%s' % p for p in self.pieces) def __repr__(self): return '' % self def select(self, tree): if tree is None: raise ValueError('Parse tree not avaialable') return Tree('*SPLIT*', [p.select(tree) for p in self.pieces]) @total_ordering @python_2_unicode_compatible class NombankTreePointer(NombankPointer): """ wordnum:height*wordnum:height*... wordnum:height, """ def __init__(self, wordnum, height): self.wordnum = wordnum self.height = height @staticmethod def parse(s): # Deal with chains (xx*yy*zz) pieces = s.split('*') if len(pieces) > 1: return NombankChainTreePointer([NombankTreePointer.parse(elt) for elt in pieces]) # Deal with split args (xx,yy,zz) pieces = s.split(',') if len(pieces) > 1: return NombankSplitTreePointer([NombankTreePointer.parse(elt) for elt in pieces]) # Deal with normal pointers. pieces = s.split(':') if len(pieces) != 2: raise ValueError('bad nombank pointer %r' % s) return NombankTreePointer(int(pieces[0]), int(pieces[1])) def __str__(self): return '%s:%s' % (self.wordnum, self.height) def __repr__(self): return 'NombankTreePointer(%d, %d)' % (self.wordnum, self.height) def __eq__(self, other): while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)): other = other.pieces[0] if not isinstance(other, NombankTreePointer): return self is other return (self.wordnum == other.wordnum and self.height == other.height) def __ne__(self, other): return not self == other def __lt__(self, other): while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)): other = other.pieces[0] if not isinstance(other, NombankTreePointer): return id(self) < id(other) return (self.wordnum, -self.height) < (other.wordnum, -other.height) def select(self, tree): if tree is None: raise ValueError('Parse tree not avaialable') return tree[self.treepos(tree)] def treepos(self, tree): """ Convert this pointer to a standard 'tree position' pointer, given that it points to the given tree. """ if tree is None: raise ValueError('Parse tree not avaialable') stack = [tree] treepos = [] wordnum = 0 while True: #print treepos #print stack[-1] # tree node: if isinstance(stack[-1], Tree): # Select the next child. if len(treepos) < len(stack): treepos.append(0) else: treepos[-1] += 1 # Update the stack. if treepos[-1] < len(stack[-1]): stack.append(stack[-1][treepos[-1]]) else: # End of node's child list: pop up a level. 
stack.pop() treepos.pop() # word node: else: if wordnum == self.wordnum: return tuple(treepos[:len(treepos)-self.height-1]) else: wordnum += 1 stack.pop() nltk-3.1/nltk/corpus/reader/nps_chat.py0000644000076500000240000000524612607224144017763 0ustar sbstaff00000000000000# Natural Language Toolkit: NPS Chat Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT from __future__ import unicode_literals import re import textwrap from nltk.util import LazyConcatenation from nltk.internals import ElementWrapper from nltk.tag import map_tag from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * from nltk.corpus.reader.xmldocs import * class NPSChatCorpusReader(XMLCorpusReader): def __init__(self, root, fileids, wrap_etree=False, tagset=None): XMLCorpusReader.__init__(self, root, fileids, wrap_etree) self._tagset = tagset def xml_posts(self, fileids=None): if self._wrap_etree: return concat([XMLCorpusView(fileid, 'Session/Posts/Post', self._wrap_elt) for fileid in self.abspaths(fileids)]) else: return concat([XMLCorpusView(fileid, 'Session/Posts/Post') for fileid in self.abspaths(fileids)]) def posts(self, fileids=None): return concat([XMLCorpusView(fileid, 'Session/Posts/Post/terminals', self._elt_to_words) for fileid in self.abspaths(fileids)]) def tagged_posts(self, fileids=None, tagset=None): def reader(elt, handler): return self._elt_to_tagged_words(elt, handler, tagset) return concat([XMLCorpusView(fileid, 'Session/Posts/Post/terminals', reader) for fileid in self.abspaths(fileids)]) def words(self, fileids=None): return LazyConcatenation(self.posts(fileids)) def tagged_words(self, fileids=None, tagset=None): return LazyConcatenation(self.tagged_posts(fileids, tagset)) def _wrap_elt(self, elt, handler): return ElementWrapper(elt) def _elt_to_words(self, elt, handler): return [self._simplify_username(t.attrib['word']) for t in elt.findall('t')] def _elt_to_tagged_words(self, elt, handler, tagset=None): tagged_post = [(self._simplify_username(t.attrib['word']), t.attrib['pos']) for t in elt.findall('t')] if tagset and tagset != self._tagset: tagged_post = [(w, map_tag(self._tagset, tagset, t)) for (w, t) in tagged_post] return tagged_post @staticmethod def _simplify_username(word): if 'User' in word: word = 'U' + word.split('User', 1)[1] elif isinstance(word, bytes): word = word.decode('ascii') return word nltk-3.1/nltk/corpus/reader/opinion_lexicon.py0000644000076500000240000000761112607224144021356 0ustar sbstaff00000000000000# Natural Language Toolkit: Opinion Lexicon Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Pierpaolo Pantone <24alsecondo@gmail.com> # URL: # For license information, see LICENSE.TXT """ CorpusReader for the Opinion Lexicon. - Opinion Lexicon information - Authors: Minqing Hu and Bing Liu, 2004. Department of Computer Sicence University of Illinois at Chicago Contact: Bing Liu, liub@cs.uic.edu http://www.cs.uic.edu/~liub Distributed with permission. Related papers: - Minqing Hu and Bing Liu. "Mining and summarizing customer reviews". Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA. - Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing Opinions on the Web". Proceedings of the 14th International World Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan. 
""" from nltk.compat import string_types from nltk.corpus.reader import WordListCorpusReader from nltk.corpus.reader.api import * class IgnoreReadmeCorpusView(StreamBackedCorpusView): """ This CorpusView is used to skip the initial readme block of the corpus. """ def __init__(self, *args, **kwargs): StreamBackedCorpusView.__init__(self, *args, **kwargs) # open self._stream self._open() # skip the readme block read_blankline_block(self._stream) # Set the initial position to the current stream position self._filepos = [self._stream.tell()] class OpinionLexiconCorpusReader(WordListCorpusReader): """ Reader for Liu and Hu opinion lexicon. Blank lines and readme are ignored. >>> from nltk.corpus import opinion_lexicon >>> opinion_lexicon.words() ['2-faced', '2-faces', 'abnormal', 'abolish', ...] The OpinionLexiconCorpusReader provides shortcuts to retrieve positive/negative words: >>> opinion_lexicon.negative() ['2-faced', '2-faces', 'abnormal', 'abolish', ...] Note that words from `words()` method are sorted by file id, not alphabetically: >>> opinion_lexicon.words()[0:10] ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted'] >>> sorted(opinion_lexicon.words())[0:10] ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort'] """ CorpusView = IgnoreReadmeCorpusView def words(self, fileids=None): """ Return all words in the opinion lexicon. Note that these words are not sorted in alphabetical order. :param fileids: a list or regexp specifying the ids of the files whose words have to be returned. :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.CorpusView(path, self._read_word_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True)]) def positive(self): """ Return all positive words in alphabetical order. :return: a list of positive words. :rtype: list(str) """ return self.words('positive-words.txt') def negative(self): """ Return all negative words in alphabetical order. :return: a list of negative words. :rtype: list(str) """ return self.words('negative-words.txt') def _read_word_block(self, stream): words = [] for i in range(20): # Read 20 lines at a time. line = stream.readline() if not line: continue words.append(line.strip()) return words nltk-3.1/nltk/corpus/reader/pl196x.py0000644000076500000240000002120012607224144017213 0ustar sbstaff00000000000000# Natural Language Toolkit: # # Copyright (C) 2001-2015 NLTK Project # Author: Piotr Kasprzyk # URL: # For license information, see LICENSE.TXT import os import re from nltk import compat from nltk import tokenize, tree from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * from nltk.corpus.reader.xmldocs import XMLCorpusReader # (?:something) -- non-capturing parentheses! PARA = re.compile(r']*){0,1}>(.*?)

    '</p>')
SENT = re.compile(r'<s(?: [^>]*){0,1}>(.*?)'
    '</s>'
    ') TAGGEDWORD = re.compile(r'<([wc](?: [^>]*){0,1}>)(.*?)') WORD = re.compile(r'<[wc](?: [^>]*){0,1}>(.*?)') TYPE = re.compile(r'type="(.*?)"') ANA = re.compile(r'ana="(.*?)"') TEXTID = re.compile(r'text id="(.*?)"') class TEICorpusView(StreamBackedCorpusView): def __init__(self, corpus_file, tagged, group_by_sent, group_by_para, tagset=None, headLen=0, textids=None): self._tagged = tagged self._textids = textids self._group_by_sent = group_by_sent self._group_by_para = group_by_para # WARNING -- skip header StreamBackedCorpusView.__init__(self, corpus_file, startpos=headLen) _pagesize = 4096 def read_block(self, stream): block = stream.readlines(self._pagesize) block = concat(block) while (block.count(' block.count('')) \ or block.count('')+len('') block = block[ :beg]+block[beg+end: ] output = [] for para_str in PARA.findall(block): para = [] for sent_str in SENT.findall(para_str): if not self._tagged: sent = WORD.findall(sent_str) else: sent = list(map(self._parse_tag, TAGGEDWORD.findall(sent_str))) if self._group_by_sent: para.append(sent) else: para.extend(sent) if self._group_by_para: output.append(para) else: output.extend(para) return output def _parse_tag(self, tag_word_tuple): (tag, word) = tag_word_tuple if tag.startswith('w'): tag = ANA.search(tag).group(1) else: # tag.startswith('c') tag = TYPE.search(tag).group(1) return (word, tag) class Pl196xCorpusReader(CategorizedCorpusReader, XMLCorpusReader): headLen = 2770 def __init__(self, *args, **kwargs): if 'textid_file' in kwargs: self._textids = kwargs['textid_file'] else: self._textids = None XMLCorpusReader.__init__(self, *args) CategorizedCorpusReader.__init__(self, kwargs) self._init_textids() def _init_textids(self): self._f2t = defaultdict(list) self._t2f = defaultdict(list) if self._textids is not None: for line in self.open(self._textids).readlines(): line = line.strip() file_id, text_ids = line.split(' ', 1) if file_id not in self.fileids(): raise ValueError('In text_id mapping file %s: %s ' 'not found' % (catfile, file_id)) for text_id in text_ids.split(self._delimiter): self._add_textids(file_id, text_id) def _add_textids(self, file_id, text_id): self._f2t[file_id].append(text_id) self._t2f[text_id].append(file_id) def _resolve(self, fileids, categories, textids=None): tmp = None if fileids is not None: if not tmp: tmp = fileids, None else: raise ValueError('Specify only fileids, categories or textids') if categories is not None: if not tmp: tmp = self.fileids(categories), None else: raise ValueError('Specify only fileids, categories or textids') if textids is not None: if not tmp: if isinstance(textids, compat.string_types): textids = [textids] files = sum((self._t2f[t] for t in textids), []) tdict = dict() for f in files: tdict[f] = (set(self._f2t[f]) & set(textids)) tmp = files, tdict else: raise ValueError('Specify only fileids, categories or textids') return None, None def decode_tag(self, tag): # to be implemented return tag def textids(self, fileids=None, categories=None): """ In the pl196x corpus each category is stored in single file and thus both methods provide identical functionality. In order to accommodate finer granularity, a non-standard textids() method was implemented. All the main functions can be supplied with a list of required chunks---giving much more control to the user. 
""" fileids, _ = self._resolve(fileids, categories) if fileids is None: return sorted(self._t2f) if isinstance(fileids, compat.string_types): fileids = [fileids] return sorted(sum((self._f2t[d] for d in fileids), [])) def words(self, fileids=None, categories=None, textids=None): fileids, textids = self._resolve(fileids, categories, textids) if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] if textids: return concat([TEICorpusView(self.abspath(fileid), False, False, False, headLen=self.headLen, textids=textids[fileid]) for fileid in fileids]) else: return concat([TEICorpusView(self.abspath(fileid), False, False, False, headLen=self.headLen) for fileid in fileids]) def sents(self, fileids=None, categories=None, textids=None): fileids, textids = self._resolve(fileids, categories, textids) if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] if textids: return concat([TEICorpusView(self.abspath(fileid), False, True, False, headLen=self.headLen, textids=textids[fileid]) for fileid in fileids]) else: return concat([TEICorpusView(self.abspath(fileid), False, True, False, headLen=self.headLen) for fileid in fileids]) def paras(self, fileids=None, categories=None, textids=None): fileids, textids = self._resolve(fileids, categories, textids) if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] if textids: return concat([TEICorpusView(self.abspath(fileid), False, True, True, headLen=self.headLen, textids=textids[fileid]) for fileid in fileids]) else: return concat([TEICorpusView(self.abspath(fileid), False, True, True, headLen=self.headLen) for fileid in fileids]) def tagged_words(self, fileids=None, categories=None, textids=None): fileids, textids = self._resolve(fileids, categories, textids) if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] if textids: return concat([TEICorpusView(self.abspath(fileid), True, False, False, headLen=self.headLen, textids=textids[fileid]) for fileid in fileids]) else: return concat([TEICorpusView(self.abspath(fileid), True, False, False, headLen=self.headLen) for fileid in fileids]) def tagged_sents(self, fileids=None, categories=None, textids=None): fileids, textids = self._resolve(fileids, categories, textids) if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] if textids: return concat([TEICorpusView(self.abspath(fileid), True, True, False, headLen=self.headLen, textids=textids[fileid]) for fileid in fileids]) else: return concat([TEICorpusView(self.abspath(fileid), True, True, False, headLen=self.headLen) for fileid in fileids]) def tagged_paras(self, fileids=None, categories=None, textids=None): fileids, textids = self._resolve(fileids, categories, textids) if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] if textids: return concat([TEICorpusView(self.abspath(fileid), True, True, True, headLen=self.headLen, textids=textids[fileid]) for fileid in fileids]) else: return concat([TEICorpusView(self.abspath(fileid), True, True, True, headLen=self.headLen) for fileid in fileids]) def xml(self, fileids=None, categories=None): fileids, _ = self._resolve(fileids, categories) if len(fileids) == 1: return XMLCorpusReader.xml(self, fileids[0]) else: raise TypeError('Expected a single file') def raw(self, fileids=None, 
categories=None): fileids, _ = self._resolve(fileids, categories) if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) nltk-3.1/nltk/corpus/reader/plaintext.py0000644000076500000240000002172112607224144020170 0ustar sbstaff00000000000000# Natural Language Toolkit: Plaintext Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # Nitin Madnani # URL: # For license information, see LICENSE.TXT """ A reader for corpora that consist of plaintext documents. """ import codecs import nltk.data from nltk.compat import string_types from nltk.tokenize import * from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class PlaintextCorpusReader(CorpusReader): """ Reader for corpora that consist of plaintext documents. Paragraphs are assumed to be split using blank lines. Sentences and words can be tokenized using the default tokenizers, or by custom tokenizers specificed as parameters to the constructor. This corpus reader can be customized (e.g., to skip preface sections of specific document formats) by creating a subclass and overriding the ``CorpusView`` class variable. """ CorpusView = StreamBackedCorpusView """The corpus view class used by this reader. Subclasses of ``PlaintextCorpusReader`` may specify alternative corpus view classes (e.g., to skip the preface sections of documents.)""" def __init__(self, root, fileids, word_tokenizer=WordPunctTokenizer(), sent_tokenizer=nltk.data.LazyLoader( 'tokenizers/punkt/english.pickle'), para_block_reader=read_blankline_block, encoding='utf8'): """ Construct a new plaintext corpus reader for a set of documents located at the given root directory. Example usage: >>> root = '/usr/local/share/nltk_data/corpora/webtext/' >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. :param word_tokenizer: Tokenizer for breaking sentences or paragraphs into words. :param sent_tokenizer: Tokenizer for breaking paragraphs into words. :param para_block_reader: The block reader used to divide the corpus into paragraph blocks. """ CorpusReader.__init__(self, root, fileids, encoding) self._word_tokenizer = word_tokenizer self._sent_tokenizer = sent_tokenizer self._para_block_reader = para_block_reader def raw(self, fileids=None): """ :return: the given file(s) as a single string. :rtype: str """ if fileids is None: fileids = self._fileids elif isinstance(fileids, string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def words(self, fileids=None): """ :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return concat([self.CorpusView(path, self._read_word_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True)]) def sents(self, fileids=None): """ :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings. :rtype: list(list(str)) """ if self._sent_tokenizer is None: raise ValueError('No sentence tokenizer for this corpus') return concat([self.CorpusView(path, self._read_sent_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True)]) def paras(self, fileids=None): """ :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as lists of word strings. 
:rtype: list(list(list(str))) """ if self._sent_tokenizer is None: raise ValueError('No sentence tokenizer for this corpus') return concat([self.CorpusView(path, self._read_para_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True)]) def _read_word_block(self, stream): words = [] for i in range(20): # Read 20 lines at a time. words.extend(self._word_tokenizer.tokenize(stream.readline())) return words def _read_sent_block(self, stream): sents = [] for para in self._para_block_reader(stream): sents.extend([self._word_tokenizer.tokenize(sent) for sent in self._sent_tokenizer.tokenize(para)]) return sents def _read_para_block(self, stream): paras = [] for para in self._para_block_reader(stream): paras.append([self._word_tokenizer.tokenize(sent) for sent in self._sent_tokenizer.tokenize(para)]) return paras class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader): """ A reader for plaintext corpora whose documents are divided into categories based on their file identifiers. """ def __init__(self, *args, **kwargs): """ Initialize the corpus reader. Categorization arguments (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to the ``CategorizedCorpusReader`` constructor. The remaining arguments are passed to the ``PlaintextCorpusReader`` constructor. """ CategorizedCorpusReader.__init__(self, kwargs) PlaintextCorpusReader.__init__(self, *args, **kwargs) def _resolve(self, fileids, categories): if fileids is not None and categories is not None: raise ValueError('Specify fileids or categories, not both') if categories is not None: return self.fileids(categories) else: return fileids def raw(self, fileids=None, categories=None): return PlaintextCorpusReader.raw( self, self._resolve(fileids, categories)) def words(self, fileids=None, categories=None): return PlaintextCorpusReader.words( self, self._resolve(fileids, categories)) def sents(self, fileids=None, categories=None): return PlaintextCorpusReader.sents( self, self._resolve(fileids, categories)) def paras(self, fileids=None, categories=None): return PlaintextCorpusReader.paras( self, self._resolve(fileids, categories)) # is there a better way? class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader): def __init__(self, *args, **kwargs): CategorizedCorpusReader.__init__(self, kwargs) kwargs['sent_tokenizer'] = nltk.data.LazyLoader('tokenizers/punkt/portuguese.pickle') PlaintextCorpusReader.__init__(self, *args, **kwargs) class EuroparlCorpusReader(PlaintextCorpusReader): """ Reader for Europarl corpora that consist of plaintext documents. Documents are divided into chapters instead of paragraphs as for regular plaintext documents. Chapters are separated using blank lines. Everything is inherited from ``PlaintextCorpusReader`` except that: - Since the corpus is pre-processed and pre-tokenized, the word tokenizer should just split the line at whitespaces. - For the same reason, the sentence tokenizer should just split the paragraph at line breaks. - There is a new 'chapters()' method that returns chapters instead instead of paragraphs. - The 'paras()' method inherited from PlaintextCorpusReader is made non-functional to remove any confusion between chapters and paragraphs for Europarl. """ def _read_word_block(self, stream): words = [] for i in range(20): # Read 20 lines at a time. 
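        # Reading a bounded number of lines per call keeps the surrounding
        # StreamBackedCorpusView lazy: each call to this block reader returns
        # one block, and later blocks are only read when the view is indexed
        # past them. Because Europarl text is pre-tokenized, a plain
        # whitespace split() of each line is sufficient here.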
words.extend(stream.readline().split()) return words def _read_sent_block(self, stream): sents = [] for para in self._para_block_reader(stream): sents.extend([sent.split() for sent in para.splitlines()]) return sents def _read_para_block(self, stream): paras = [] for para in self._para_block_reader(stream): paras.append([sent.split() for sent in para.splitlines()]) return paras def chapters(self, fileids=None): """ :return: the given file(s) as a list of chapters, each encoded as a list of sentences, which are in turn encoded as lists of word strings. :rtype: list(list(list(str))) """ return concat([self.CorpusView(fileid, self._read_para_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True)]) def paras(self, fileids=None): raise NotImplementedError('The Europarl corpus reader does not support paragraphs. Please use chapters() instead.') nltk-3.1/nltk/corpus/reader/ppattach.py0000644000076500000240000000610212607224144017760 0ustar sbstaff00000000000000# Natural Language Toolkit: PP Attachment Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ Read lines from the Prepositional Phrase Attachment Corpus. The PP Attachment Corpus contains several files having the format: sentence_id verb noun1 preposition noun2 attachment For example: 42960 gives authority to administration V 46742 gives inventors of microchip N The PP attachment is to the verb phrase (V) or noun phrase (N), i.e.: (VP gives (NP authority) (PP to administration)) (VP gives (NP inventors (PP of microchip))) The corpus contains the following files: training: training set devset: development test set, used for algorithm development. test: test set, used to report results bitstrings: word classes derived from Mutual Information Clustering for the Wall Street Journal. Ratnaparkhi, Adwait (1994). A Maximum Entropy Model for Prepositional Phrase Attachment. Proceedings of the ARPA Human Language Technology Conference. [http://www.cis.upenn.edu/~adwait/papers/hlt94.ps] The PP Attachment Corpus is distributed with NLTK with the permission of the author. 
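A minimal usage sketch (assuming the "ppattach" data package has been
installed, e.g. via nltk.download()):

    from nltk.corpus import ppattach
    inst = ppattach.attachments('training')[0]
    inst.verb, inst.noun1, inst.prep, inst.noun2, inst.attachment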
""" from __future__ import unicode_literals from nltk import compat from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * @compat.python_2_unicode_compatible class PPAttachment(object): def __init__(self, sent, verb, noun1, prep, noun2, attachment): self.sent = sent self.verb = verb self.noun1 = noun1 self.prep = prep self.noun2 = noun2 self.attachment = attachment def __repr__(self): return ('PPAttachment(sent=%r, verb=%r, noun1=%r, prep=%r, ' 'noun2=%r, attachment=%r)' % (self.sent, self.verb, self.noun1, self.prep, self.noun2, self.attachment)) class PPAttachmentCorpusReader(CorpusReader): """ sentence_id verb noun1 preposition noun2 attachment """ def attachments(self, fileids): return concat([StreamBackedCorpusView(fileid, self._read_obj_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True)]) def tuples(self, fileids): return concat([StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True)]) def raw(self, fileids=None): if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def _read_tuple_block(self, stream): line = stream.readline() if line: return [tuple(line.split())] else: return [] def _read_obj_block(self, stream): line = stream.readline() if line: return [PPAttachment(*line.split())] else: return [] nltk-3.1/nltk/corpus/reader/propbank.py0000644000076500000240000004267112607224144020003 0ustar sbstaff00000000000000# Natural Language Toolkit: PropBank Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT from __future__ import unicode_literals import re from xml.etree import ElementTree from nltk import compat from nltk.tree import Tree from nltk.internals import raise_unorderable_types from nltk.compat import total_ordering from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class PropbankCorpusReader(CorpusReader): """ Corpus reader for the propbank corpus, which augments the Penn Treebank with information about the predicate argument structure of every verb instance. The corpus consists of two parts: the predicate-argument annotations themselves, and a set of "frameset files" which define the argument labels used by the annotations, on a per-verb basis. Each "frameset file" contains one or more predicates, such as ``'turn'`` or ``'turn_on'``, each of which is divided into coarse-grained word senses called "rolesets". For each "roleset", the frameset file provides descriptions of the argument roles, along with examples. """ def __init__(self, root, propfile, framefiles='', verbsfile=None, parse_fileid_xform=None, parse_corpus=None, encoding='utf8'): """ :param root: The root directory for this corpus. :param propfile: The name of the file containing the predicate- argument annotations (relative to ``root``). :param framefiles: A list or regexp specifying the frameset fileids for this corpus. :param parse_fileid_xform: A transform that should be applied to the fileids in this corpus. This should be a function of one argument (a fileid) that returns a string (the new fileid). :param parse_corpus: The corpus containing the parse trees corresponding to this corpus. These parse trees are necessary to resolve the tree pointers used by propbank. """ # If framefiles is specified as a regexp, expand it. 
if isinstance(framefiles, compat.string_types): framefiles = find_corpus_fileids(root, framefiles) framefiles = list(framefiles) # Initialze the corpus reader. CorpusReader.__init__(self, root, [propfile, verbsfile] + framefiles, encoding) # Record our frame fileids & prop file. self._propfile = propfile self._framefiles = framefiles self._verbsfile = verbsfile self._parse_fileid_xform = parse_fileid_xform self._parse_corpus = parse_corpus def raw(self, fileids=None): """ :return: the text contents of the given fileids, as a single string. """ if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def instances(self, baseform=None): """ :return: a corpus view that acts as a list of ``PropBankInstance`` objects, one for each noun in the corpus. """ kwargs = {} if baseform is not None: kwargs['instance_filter'] = lambda inst: inst.baseform==baseform return StreamBackedCorpusView(self.abspath(self._propfile), lambda stream: self._read_instance_block(stream, **kwargs), encoding=self.encoding(self._propfile)) def lines(self): """ :return: a corpus view that acts as a list of strings, one for each line in the predicate-argument annotation file. """ return StreamBackedCorpusView(self.abspath(self._propfile), read_line_block, encoding=self.encoding(self._propfile)) def roleset(self, roleset_id): """ :return: the xml description for the given roleset. """ baseform = roleset_id.split('.')[0] framefile = 'frames/%s.xml' % baseform if framefile not in self._framefiles: raise ValueError('Frameset file for %s not found' % roleset_id) # n.b.: The encoding for XML fileids is specified by the file # itself; so we ignore self._encoding here. etree = ElementTree.parse(self.abspath(framefile).open()).getroot() for roleset in etree.findall('predicate/roleset'): if roleset.attrib['id'] == roleset_id: return roleset else: raise ValueError('Roleset %s not found in %s' % (roleset_id, framefile)) def rolesets(self, baseform=None): """ :return: list of xml descriptions for rolesets. """ if baseform is not None: framefile = 'frames/%s.xml' % baseform if framefile not in self._framefiles: raise ValueError('Frameset file for %s not found' % baseform) framefiles = [framefile] else: framefiles = self._framefiles rsets = [] for framefile in framefiles: # n.b.: The encoding for XML fileids is specified by the file # itself; so we ignore self._encoding here. etree = ElementTree.parse(self.abspath(framefile).open()).getroot() rsets.append(etree.findall('predicate/roleset')) return LazyConcatenation(rsets) def verbs(self): """ :return: a corpus view that acts as a list of all verb lemmas in this corpus (from the verbs.txt file). """ return StreamBackedCorpusView(self.abspath(self._verbsfile), read_line_block, encoding=self.encoding(self._verbsfile)) def _read_instance_block(self, stream, instance_filter=lambda inst: True): block = [] # Read 100 at a time. 
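        # The annotation file is consumed in bounded chunks (100 lines per
        # call) so that the StreamBackedCorpusView built on top of this block
        # reader stays lazy and never loads the whole file at once.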
for i in range(100): line = stream.readline().strip() if line: inst = PropbankInstance.parse( line, self._parse_fileid_xform, self._parse_corpus) if instance_filter(inst): block.append(inst) return block ###################################################################### #{ Propbank Instance & related datatypes ###################################################################### @compat.python_2_unicode_compatible class PropbankInstance(object): def __init__(self, fileid, sentnum, wordnum, tagger, roleset, inflection, predicate, arguments, parse_corpus=None): self.fileid = fileid """The name of the file containing the parse tree for this instance's sentence.""" self.sentnum = sentnum """The sentence number of this sentence within ``fileid``. Indexing starts from zero.""" self.wordnum = wordnum """The word number of this instance's predicate within its containing sentence. Word numbers are indexed starting from zero, and include traces and other empty parse elements.""" self.tagger = tagger """An identifier for the tagger who tagged this instance; or ``'gold'`` if this is an adjuticated instance.""" self.roleset = roleset """The name of the roleset used by this instance's predicate. Use ``propbank.roleset() `` to look up information about the roleset.""" self.inflection = inflection """A ``PropbankInflection`` object describing the inflection of this instance's predicate.""" self.predicate = predicate """A ``PropbankTreePointer`` indicating the position of this instance's predicate within its containing sentence.""" self.arguments = tuple(arguments) """A list of tuples (argloc, argid), specifying the location and identifier for each of the predicate's argument in the containing sentence. Argument identifiers are strings such as ``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain the predicate.""" self.parse_corpus = parse_corpus """A corpus reader for the parse trees corresponding to the instances in this propbank corpus.""" @property def baseform(self): """The baseform of the predicate.""" return self.roleset.split('.')[0] @property def sensenumber(self): """The sense number of the predicate.""" return self.roleset.split('.')[1] @property def predid(self): """Identifier of the predicate.""" return 'rel' def __repr__(self): return ('' % (self.fileid, self.sentnum, self.wordnum)) def __str__(self): s = '%s %s %s %s %s %s' % (self.fileid, self.sentnum, self.wordnum, self.tagger, self.roleset, self.inflection) items = self.arguments + ((self.predicate, 'rel'),) for (argloc, argid) in sorted(items): s += ' %s-%s' % (argloc, argid) return s def _get_tree(self): if self.parse_corpus is None: return None if self.fileid not in self.parse_corpus.fileids(): return None return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum] tree = property(_get_tree, doc=""" The parse tree corresponding to this instance, or None if the corresponding tree is not available.""") @staticmethod def parse(s, parse_fileid_xform=None, parse_corpus=None): pieces = s.split() if len(pieces) < 7: raise ValueError('Badly formatted propbank line: %r' % s) # Divide the line into its basic pieces. (fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6] rel = [p for p in pieces[6:] if p.endswith('-rel')] args = [p for p in pieces[6:] if not p.endswith('-rel')] if len(rel) != 1: raise ValueError('Badly formatted propbank line: %r' % s) # Apply the fileid selector, if any. if parse_fileid_xform is not None: fileid = parse_fileid_xform(fileid) # Convert sentence & word numbers to ints. 
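        # For reference, a raw propbank annotation line has the shape
        #   <fileid> <sentnum> <wordnum> <tagger> <roleset> <inflection> <arg>...
        # for example (illustrative values only, not copied from the corpus):
        #   wsj/02/wsj_0200.mrg 13 6 gold refuse.01 vn--a 6:0-rel 5:1-ARG0 7:2-ARG1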
sentnum = int(sentnum) wordnum = int(wordnum) # Parse the inflection inflection = PropbankInflection.parse(inflection) # Parse the predicate location. predicate = PropbankTreePointer.parse(rel[0][:-4]) # Parse the arguments. arguments = [] for arg in args: argloc, argid = arg.split('-', 1) arguments.append( (PropbankTreePointer.parse(argloc), argid) ) # Put it all together. return PropbankInstance(fileid, sentnum, wordnum, tagger, roleset, inflection, predicate, arguments, parse_corpus) class PropbankPointer(object): """ A pointer used by propbank to identify one or more constituents in a parse tree. ``PropbankPointer`` is an abstract base class with three concrete subclasses: - ``PropbankTreePointer`` is used to point to single constituents. - ``PropbankSplitTreePointer`` is used to point to 'split' constituents, which consist of a sequence of two or more ``PropbankTreePointer`` pointers. - ``PropbankChainTreePointer`` is used to point to entire trace chains in a tree. It consists of a sequence of pieces, which can be ``PropbankTreePointer`` or ``PropbankSplitTreePointer`` pointers. """ def __init__(self): if self.__class__ == PropbankPointer: raise NotImplementedError() @compat.python_2_unicode_compatible class PropbankChainTreePointer(PropbankPointer): def __init__(self, pieces): self.pieces = pieces """A list of the pieces that make up this chain. Elements may be either ``PropbankSplitTreePointer`` or ``PropbankTreePointer`` pointers.""" def __str__(self): return '*'.join('%s' % p for p in self.pieces) def __repr__(self): return '' % self def select(self, tree): if tree is None: raise ValueError('Parse tree not avaialable') return Tree('*CHAIN*', [p.select(tree) for p in self.pieces]) @compat.python_2_unicode_compatible class PropbankSplitTreePointer(PropbankPointer): def __init__(self, pieces): self.pieces = pieces """A list of the pieces that make up this chain. Elements are all ``PropbankTreePointer`` pointers.""" def __str__(self): return ','.join('%s' % p for p in self.pieces) def __repr__(self): return '' % self def select(self, tree): if tree is None: raise ValueError('Parse tree not avaialable') return Tree('*SPLIT*', [p.select(tree) for p in self.pieces]) @total_ordering @compat.python_2_unicode_compatible class PropbankTreePointer(PropbankPointer): """ wordnum:height*wordnum:height*... wordnum:height, """ def __init__(self, wordnum, height): self.wordnum = wordnum self.height = height @staticmethod def parse(s): # Deal with chains (xx*yy*zz) pieces = s.split('*') if len(pieces) > 1: return PropbankChainTreePointer([PropbankTreePointer.parse(elt) for elt in pieces]) # Deal with split args (xx,yy,zz) pieces = s.split(',') if len(pieces) > 1: return PropbankSplitTreePointer([PropbankTreePointer.parse(elt) for elt in pieces]) # Deal with normal pointers. 
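        # At this point the pointer is a plain "wordnum:height" pair, e.g.
        # "5:2" (the constituent two levels above word 5); chains such as
        # "5:2*7:0" and split arguments such as "5:2,7:0" were handled above.
        # (The pointer values here are illustrative, not taken from the data.)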
pieces = s.split(':') if len(pieces) != 2: raise ValueError('bad propbank pointer %r' % s) return PropbankTreePointer(int(pieces[0]), int(pieces[1])) def __str__(self): return '%s:%s' % (self.wordnum, self.height) def __repr__(self): return 'PropbankTreePointer(%d, %d)' % (self.wordnum, self.height) def __eq__(self, other): while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)): other = other.pieces[0] if not isinstance(other, PropbankTreePointer): return self is other return (self.wordnum == other.wordnum and self.height == other.height) def __ne__(self, other): return not self == other def __lt__(self, other): while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)): other = other.pieces[0] if not isinstance(other, PropbankTreePointer): return id(self) < id(other) return (self.wordnum, -self.height) < (other.wordnum, -other.height) def select(self, tree): if tree is None: raise ValueError('Parse tree not avaialable') return tree[self.treepos(tree)] def treepos(self, tree): """ Convert this pointer to a standard 'tree position' pointer, given that it points to the given tree. """ if tree is None: raise ValueError('Parse tree not avaialable') stack = [tree] treepos = [] wordnum = 0 while True: #print treepos #print stack[-1] # tree node: if isinstance(stack[-1], Tree): # Select the next child. if len(treepos) < len(stack): treepos.append(0) else: treepos[-1] += 1 # Update the stack. if treepos[-1] < len(stack[-1]): stack.append(stack[-1][treepos[-1]]) else: # End of node's child list: pop up a level. stack.pop() treepos.pop() # word node: else: if wordnum == self.wordnum: return tuple(treepos[:len(treepos)-self.height-1]) else: wordnum += 1 stack.pop() @compat.python_2_unicode_compatible class PropbankInflection(object): #{ Inflection Form INFINITIVE = 'i' GERUND = 'g' PARTICIPLE = 'p' FINITE = 'v' #{ Inflection Tense FUTURE = 'f' PAST = 'p' PRESENT = 'n' #{ Inflection Aspect PERFECT = 'p' PROGRESSIVE = 'o' PERFECT_AND_PROGRESSIVE = 'b' #{ Inflection Person THIRD_PERSON = '3' #{ Inflection Voice ACTIVE = 'a' PASSIVE = 'p' #{ Inflection NONE = '-' #} def __init__(self, form='-', tense='-', aspect='-', person='-', voice='-'): self.form = form self.tense = tense self.aspect = aspect self.person = person self.voice = voice def __str__(self): return self.form+self.tense+self.aspect+self.person+self.voice def __repr__(self): return '' % self _VALIDATE = re.compile(r'[igpv\-][fpn\-][pob\-][3\-][ap\-]$') @staticmethod def parse(s): if not isinstance(s, compat.string_types): raise TypeError('expected a string') if (len(s) != 5 or not PropbankInflection._VALIDATE.match(s)): raise ValueError('Bad propbank inflection string %r' % s) return PropbankInflection(*s) nltk-3.1/nltk/corpus/reader/pros_cons.py0000644000076500000240000001144612607224144020170 0ustar sbstaff00000000000000# Natural Language Toolkit: Pros and Cons Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Pierpaolo Pantone <24alsecondo@gmail.com> # URL: # For license information, see LICENSE.TXT """ CorpusReader for the Pros and Cons dataset. - Pros and Cons dataset information - Contact: Bing Liu, liub@cs.uic.edu http://www.cs.uic.edu/~liub Distributed with permission. Related papers: - Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences". Proceedings of the 22nd International Conference on Computational Linguistics (Coling-2008), Manchester, 18-22 August, 2008. - Bing Liu, Minqing Hu and Junsheng Cheng. 
"Opinion Observer: Analyzing and Comparing Opinions on the Web". Proceedings of the 14th international World Wide Web conference (WWW-2005), May 10-14, 2005, in Chiba, Japan. """ import re from nltk.corpus.reader.api import * from nltk.tokenize import * class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader): """ Reader for the Pros and Cons sentence dataset. >>> from nltk.corpus import pros_cons >>> pros_cons.sents(categories='Cons') [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy', 'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'], ...] >>> pros_cons.words('IntegratedPros.txt') ['Easy', 'to', 'use', ',', 'economical', '!', ...] """ CorpusView = StreamBackedCorpusView def __init__(self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding='utf8', **kwargs): """ :param root: The root directory for the corpus. :param fileids: a list or regexp specifying the fileids in the corpus. :param word_tokenizer: a tokenizer for breaking sentences or paragraphs into words. Default: `WhitespaceTokenizer` :param encoding: the encoding that should be used to read the corpus. :param kwargs: additional parameters passed to CategorizedCorpusReader. """ CorpusReader.__init__(self, root, fileids, encoding) CategorizedCorpusReader.__init__(self, kwargs) self._word_tokenizer = word_tokenizer def sents(self, fileids=None, categories=None): """ Return all sentences in the corpus or in the specified files/categories. :param fileids: a list or regexp specifying the ids of the files whose sentences have to be returned. :param categories: a list specifying the categories whose sentences have to be returned. :return: the given file(s) as a list of sentences. Each sentence is tokenized using the specified word_tokenizer. :rtype: list(list(str)) """ fileids = self._resolve(fileids, categories) if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.CorpusView(path, self._read_sent_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True)]) def words(self, fileids=None, categories=None): """ Return all words and punctuation symbols in the corpus or in the specified files/categories. :param fileids: a list or regexp specifying the ids of the files whose words have to be returned. :param categories: a list specifying the categories whose words have to be returned. :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ fileids = self._resolve(fileids, categories) if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.CorpusView(path, self._read_word_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True)]) def _read_sent_block(self, stream): sents = [] for i in range(20): # Read 20 lines at a time. 
line = stream.readline() if not line: continue sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)", line) if sent: sents.append(self._word_tokenizer.tokenize(sent.group(2).strip())) return sents def _read_word_block(self, stream): words = [] for sent in self._read_sent_block(stream): words.extend(sent) return words def _resolve(self, fileids, categories): if fileids is not None and categories is not None: raise ValueError('Specify fileids or categories, not both') if categories is not None: return self.fileids(categories) else: return fileids nltk-3.1/nltk/corpus/reader/reviews.py0000644000076500000240000003021712607224144017644 0ustar sbstaff00000000000000# Natural Language Toolkit: Product Reviews Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Pierpaolo Pantone <24alsecondo@gmail.com> # URL: # For license information, see LICENSE.TXT """ CorpusReader for reviews corpora (syntax based on Customer Review Corpus). - Customer Review Corpus information - Annotated by: Minqing Hu and Bing Liu, 2004. Department of Computer Sicence University of Illinois at Chicago Contact: Bing Liu, liub@cs.uic.edu http://www.cs.uic.edu/~liub Distributed with permission. The "product_reviews_1" and "product_reviews_2" datasets respectively contain annotated customer reviews of 5 and 9 products from amazon.com. Related papers: - Minqing Hu and Bing Liu. "Mining and summarizing customer reviews". Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery & Data Mining (KDD-04), 2004. - Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews". Proceedings of Nineteeth National Conference on Artificial Intelligence (AAAI-2004), 2004. - Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Appraoch to Opinion Mining." Proceedings of First ACM International Conference on Web Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University, Stanford, California, USA. Symbols used in the annotated reviews: [t] : the title of the review: Each [t] tag starts a review. xxxx[+|-n]: xxxx is a product feature. [+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest. Note that the strength is quite subjective. You may want ignore it, but only considering + and - [-n]: Negative opinion ## : start of each sentence. Each line is a sentence. [u] : feature not appeared in the sentence. [p] : feature not appeared in the sentence. Pronoun resolution is needed. [s] : suggestion or recommendation. [cc]: comparison with a competing product from a different brand. [cs]: comparison with a competing product from the same brand. Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not provide separation between different reviews. This is due to the fact that the dataset was specifically designed for aspect/feature-based sentiment analysis, for which sentence-level annotation is sufficient. For document- level classification and analysis, this peculiarity should be taken into consideration. """ import re from nltk.corpus.reader.api import * from nltk.tokenize import * TITLE = re.compile(r'^\[t\](.*)$') # [t] Title FEATURES = re.compile(r'((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]') # find 'feature' in feature[+3] NOTES = re.compile(r'\[(?!t)(p|u|s|cc|cs)\]') # find 'p' in camera[+2][p] SENT = re.compile(r'##(.*)$') # find tokenized sentence @compat.python_2_unicode_compatible class Review(object): """ A Review is the main block of a ReviewsCorpusReader. 
""" def __init__(self, title=None, review_lines=None): """ :param title: the title of the review. :param review_lines: the list of the ReviewLines that belong to the Review. """ self.title = title if review_lines is None: self.review_lines = [] else: self.review_lines = review_lines def add_line(self, review_line): """ Add a line (ReviewLine) to the review. :param review_line: a ReviewLine instance that belongs to the Review. """ assert isinstance(review_line, ReviewLine) self.review_lines.append(review_line) def features(self): """ Return a list of features in the review. Each feature is a tuple made of the specific item feature and the opinion strength about that feature. :return: all features of the review as a list of tuples (feat, score). :rtype: list(tuple) """ features = [] for review_line in self.review_lines: features.extend(review_line.features) return features def sents(self): """ Return all tokenized sentences in the review. :return: all sentences of the review as lists of tokens. :rtype: list(list(str)) """ return [review_line.sent for review_line in self.review_lines] def __repr__(self): return 'Review(title=\"{}\", review_lines={})'.format(self.title, self.review_lines) @compat.python_2_unicode_compatible class ReviewLine(object): """ A ReviewLine represents a sentence of the review, together with (optional) annotations of its features and notes about the reviewed item. """ def __init__(self, sent, features=None, notes=None): self.sent = sent if features is None: self.features = [] else: self.features = features if notes is None: self.notes = [] else: self.notes = notes def __repr__(self): return ('ReviewLine(features={}, notes={}, sent={})'.format( self.features, self.notes, self.sent)) class ReviewsCorpusReader(CorpusReader): """ Reader for the Customer Review Data dataset by Hu, Liu (2004). Note: we are not applying any sentence tokenization at the moment, just word tokenization. >>> from nltk.corpus import product_reviews_1 >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt') >>> review = camera_reviews[0] >>> review.sents()[0] ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am', 'extremely', 'satisfied', 'with', 'the', 'purchase', '.'] >>> review.features() [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'), ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'), ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'), ('option', '+1')] We can also reach the same information directly from the stream: >>> product_reviews_1.features('Canon_G3.txt') [('canon powershot g3', '+3'), ('use', '+2'), ...] We can compute stats for specific product features: >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) >>> # We use float for backward compatibility with division in Python2.7 >>> mean = float(tot)/n_reviews >>> print(n_reviews, tot, mean) 15 24 1.6 """ CorpusView = StreamBackedCorpusView def __init__(self, root, fileids, word_tokenizer=WordPunctTokenizer(), encoding='utf8'): """ :param root: The root directory for the corpus. :param fileids: a list or regexp specifying the fileids in the corpus. :param word_tokenizer: a tokenizer for breaking sentences or paragraphs into words. Default: `WordPunctTokenizer` :param encoding: the encoding that should be used to read the corpus. 
""" CorpusReader.__init__(self, root, fileids, encoding) self._word_tokenizer = word_tokenizer def features(self, fileids=None): """ Return a list of features. Each feature is a tuple made of the specific item feature and the opinion strength about that feature. :param fileids: a list or regexp specifying the ids of the files whose features have to be returned. :return: all features for the item(s) in the given file(s). :rtype: list(tuple) """ if fileids is None: fileids = self._fileids elif isinstance(fileids, string_types): fileids = [fileids] return concat([self.CorpusView(fileid, self._read_features, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True)]) def raw(self, fileids=None): """ :param fileids: a list or regexp specifying the fileids of the files that have to be returned as a raw string. :return: the given file(s) as a single string. :rtype: str """ if fileids is None: fileids = self._fileids elif isinstance(fileids, string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def readme(self): """ Return the contents of the corpus README.txt file. """ return self.open("README.txt").read() def reviews(self, fileids=None): """ Return all the reviews as a list of Review objects. If `fileids` is specified, return all the reviews from each of the specified files. :param fileids: a list or regexp specifying the ids of the files whose reviews have to be returned. :return: the given file(s) as a list of reviews. """ if fileids is None: fileids = self._fileids return concat([self.CorpusView(fileid, self._read_review_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True)]) def sents(self, fileids=None): """ Return all sentences in the corpus or in the specified files. :param fileids: a list or regexp specifying the ids of the files whose sentences have to be returned. :return: the given file(s) as a list of sentences, each encoded as a list of word strings. :rtype: list(list(str)) """ return concat([self.CorpusView(path, self._read_sent_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True)]) def words(self, fileids=None): """ Return all words and punctuation symbols in the corpus or in the specified files. :param fileids: a list or regexp specifying the ids of the files whose words have to be returned. :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return concat([self.CorpusView(path, self._read_word_block, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True)]) def _read_features(self, stream): features = [] for i in range(20): line = stream.readline() if not line: return features features.extend(re.findall(FEATURES, line)) return features def _read_review_block(self, stream): while True: line = stream.readline() if not line: return [] # end of file. title_match = re.match(TITLE, line) if title_match: review = Review(title=title_match.group(1).strip()) # We create a new review break # Scan until we find another line matching the regexp, or EOF. while True: oldpos = stream.tell() line = stream.readline() # End of file: if not line: return [review] # Start of a new review: backup to just before it starts, and # return the review we've already collected. if re.match(TITLE, line): stream.seek(oldpos) return [review] # Anything else is part of the review line. 
feats = re.findall(FEATURES, line) notes = re.findall(NOTES, line) sent = re.findall(SENT, line) if sent: sent = self._word_tokenizer.tokenize(sent[0]) review_line = ReviewLine(sent=sent, features=feats, notes=notes) review.add_line(review_line) def _read_sent_block(self, stream): sents = [] for review in self._read_review_block(stream): sents.extend([sent for sent in review.sents()]) return sents def _read_word_block(self, stream): words = [] for i in range(20): # Read 20 lines at a time. line = stream.readline() sent = re.findall(SENT, line) if sent: words.extend(self._word_tokenizer.tokenize(sent[0])) return words nltk-3.1/nltk/corpus/reader/rte.py0000644000076500000240000001122312607224144016746 0ustar sbstaff00000000000000# Natural Language Toolkit: RTE Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT """ Corpus reader for the Recognizing Textual Entailment (RTE) Challenge Corpora. The files were taken from the RTE1, RTE2 and RTE3 datasets and the files were regularized. Filenames are of the form rte*_dev.xml and rte*_test.xml. The latter are the gold standard annotated files. Each entailment corpus is a list of 'text'/'hypothesis' pairs. The following example is taken from RTE3:: The sale was made to pay Yukos' US$ 27.5 billion tax bill, Yuganskneftegaz was originally sold for US$ 9.4 billion to a little known company Baikalfinansgroup which was later bought by the Russian state-owned oil company Rosneft . Baikalfinansgroup was sold to Rosneft. In order to provide globally unique IDs for each pair, a new attribute ``challenge`` has been added to the root element ``entailment-corpus`` of each file, taking values 1, 2 or 3. The GID is formatted 'm-n', where 'm' is the challenge number and 'n' is the pair ID. """ from __future__ import unicode_literals from nltk import compat from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * from nltk.corpus.reader.xmldocs import * def norm(value_string): """ Normalize the string value in an RTE pair's ``value`` or ``entailment`` attribute as an integer (1, 0). :param value_string: the label used to classify a text/hypothesis pair :type value_string: str :rtype: int """ valdict = {"TRUE": 1, "FALSE": 0, "YES": 1, "NO": 0} return valdict[value_string.upper()] @compat.python_2_unicode_compatible class RTEPair(object): """ Container for RTE text-hypothesis pairs. The entailment relation is signalled by the ``value`` attribute in RTE1, and by ``entailment`` in RTE2 and RTE3. These both get mapped on to the ``entailment`` attribute of this class. 
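    A minimal usage sketch (assuming the RTE corpus data is installed; the
    fileid below is an assumption based on the naming scheme described in
    the module docstring):

        from nltk.corpus import rte
        pair = rte.pairs(['rte1_dev.xml'])[0]
        pair.gid, pair.value
        pair.text
        pair.hyp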
""" def __init__(self, pair, challenge=None, id=None, text=None, hyp=None, value=None, task=None, length=None): """ :param challenge: version of the RTE challenge (i.e., RTE1, RTE2 or RTE3) :param id: identifier for the pair :param text: the text component of the pair :param hyp: the hypothesis component of the pair :param value: classification label for the pair :param task: attribute for the particular NLP task that the data was drawn from :param length: attribute for the length of the text of the pair """ self.challenge = challenge self.id = pair.attrib["id"] self.gid = "%s-%s" % (self.challenge, self.id) self.text = pair[0].text self.hyp = pair[1].text if "value" in pair.attrib: self.value = norm(pair.attrib["value"]) elif "entailment" in pair.attrib: self.value = norm(pair.attrib["entailment"]) else: self.value = value if "task" in pair.attrib: self.task = pair.attrib["task"] else: self.task = task if "length" in pair.attrib: self.length = pair.attrib["length"] else: self.length = length def __repr__(self): if self.challenge: return '' % (self.challenge, self.id) else: return '' % self.id class RTECorpusReader(XMLCorpusReader): """ Corpus reader for corpora in RTE challenges. This is just a wrapper around the XMLCorpusReader. See module docstring above for the expected structure of input documents. """ def _read_etree(self, doc): """ Map the XML input into an RTEPair. This uses the ``getiterator()`` method from the ElementTree package to find all the ```` elements. :param doc: a parsed XML document :rtype: list(RTEPair) """ try: challenge = doc.attrib['challenge'] except KeyError: challenge = None return [RTEPair(pair, challenge=challenge) for pair in doc.getiterator("pair")] def pairs(self, fileids): """ Build a list of RTEPairs from a RTE corpus. :param fileids: a list of RTE corpus fileids :type: list :rtype: list(RTEPair) """ if isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self._read_etree(self.xml(fileid)) for fileid in fileids]) nltk-3.1/nltk/corpus/reader/semcor.py0000644000076500000240000002514212607224144017451 0ustar sbstaff00000000000000# Natural Language Toolkit: SemCor Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Nathan Schneider # URL: # For license information, see LICENSE.TXT """ Corpus reader for the SemCor Corpus. """ from __future__ import absolute_import, unicode_literals __docformat__ = 'epytext en' from nltk.corpus.reader.api import * from nltk.corpus.reader.xmldocs import XMLCorpusReader, XMLCorpusView from nltk.tree import Tree class SemcorCorpusReader(XMLCorpusReader): """ Corpus reader for the SemCor Corpus. For access to the complete XML data structure, use the ``xml()`` method. For access to simple word lists and tagged word lists, use ``words()``, ``sents()``, ``tagged_words()``, and ``tagged_sents()``. """ def __init__(self, root, fileids, wordnet, lazy=True): XMLCorpusReader.__init__(self, root, fileids) self._lazy = lazy self._wordnet = wordnet def words(self, fileids=None): """ :return: the given file(s) as a list of words and punctuation symbols. :rtype: list(str) """ return self._items(fileids, 'word', False, False, False) def chunks(self, fileids=None): """ :return: the given file(s) as a list of chunks, each of which is a list of words and punctuation symbols that form a unit. 
:rtype: list(list(str)) """ return self._items(fileids, 'chunk', False, False, False) def tagged_chunks(self, fileids=None, tag=('pos' or 'sem' or 'both')): """ :return: the given file(s) as a list of tagged chunks, represented in tree form. :rtype: list(Tree) :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'` to indicate the kind of tags to include. Semantic tags consist of WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity without a specific entry in WordNet. (Named entities of type 'other' have no lemma. Other chunks not in WordNet have no semantic tag. Punctuation tokens have `None` for their part of speech tag.) """ return self._items(fileids, 'chunk', False, tag!='sem', tag!='pos') def sents(self, fileids=None): """ :return: the given file(s) as a list of sentences, each encoded as a list of word strings. :rtype: list(list(str)) """ return self._items(fileids, 'word', True, False, False) def chunk_sents(self, fileids=None): """ :return: the given file(s) as a list of sentences, each encoded as a list of chunks. :rtype: list(list(list(str))) """ return self._items(fileids, 'chunk', True, False, False) def tagged_sents(self, fileids=None, tag=('pos' or 'sem' or 'both')): """ :return: the given file(s) as a list of sentences. Each sentence is represented as a list of tagged chunks (in tree form). :rtype: list(list(Tree)) :param tag: `'pos'` (part of speech), `'sem'` (semantic), or `'both'` to indicate the kind of tags to include. Semantic tags consist of WordNet lemma IDs, plus an `'NE'` node if the chunk is a named entity without a specific entry in WordNet. (Named entities of type 'other' have no lemma. Other chunks not in WordNet have no semantic tag. Punctuation tokens have `None` for their part of speech tag.) """ return self._items(fileids, 'chunk', True, tag!='sem', tag!='pos') def _items(self, fileids, unit, bracket_sent, pos_tag, sem_tag): if unit=='word' and not bracket_sent: # the result of the SemcorWordView may be a multiword unit, so the # LazyConcatenation will make sure the sentence is flattened _ = lambda *args: LazyConcatenation((SemcorWordView if self._lazy else self._words)(*args)) else: _ = SemcorWordView if self._lazy else self._words return concat([_(fileid, unit, bracket_sent, pos_tag, sem_tag, self._wordnet) for fileid in self.abspaths(fileids)]) def _words(self, fileid, unit, bracket_sent, pos_tag, sem_tag): """ Helper used to implement the view methods -- returns a list of tokens, (segmented) words, chunks, or sentences. The tokens and chunks may optionally be tagged (with POS and sense information). :param fileid: The name of the underlying file. :param unit: One of `'token'`, `'word'`, or `'chunk'`. :param bracket_sent: If true, include sentence bracketing. :param pos_tag: Whether to include part-of-speech tags. :param sem_tag: Whether to include semantic tags, namely WordNet lemma and OOV named entity status. """ assert unit in ('token', 'word', 'chunk') result = [] xmldoc = ElementTree.parse(fileid).getroot() for xmlsent in xmldoc.findall('.//s'): sent = [] for xmlword in _all_xmlwords_in(xmlsent): itm = SemcorCorpusReader._word(xmlword, unit, pos_tag, sem_tag, self._wordnet) if unit=='word': sent.extend(itm) else: sent.append(itm) if bracket_sent: result.append(SemcorSentence(xmlsent.attrib['snum'], sent)) else: result.extend(sent) assert None not in result return result @staticmethod def _word(xmlword, unit, pos_tag, sem_tag, wordnet): tkn = xmlword.text if not tkn: tkn = "" # fixes issue 337? 
lemma = xmlword.get('lemma', tkn) # lemma or NE class lexsn = xmlword.get('lexsn') # lex_sense (locator for the lemma's sense) if lexsn is not None: sense_key = lemma + '%' + lexsn wnpos = ('n','v','a','r','s')[int(lexsn.split(':')[0])-1] # see http://wordnet.princeton.edu/man/senseidx.5WN.html else: sense_key = wnpos = None redef = xmlword.get('rdf', tkn) # redefinition--this indicates the lookup string # does not exactly match the enclosed string, e.g. due to typographical adjustments # or discontinuity of a multiword expression. If a redefinition has occurred, # the "rdf" attribute holds its inflected form and "lemma" holds its lemma. # For NEs, "rdf", "lemma", and "pn" all hold the same value (the NE class). sensenum = xmlword.get('wnsn') # WordNet sense number isOOVEntity = 'pn' in xmlword.keys() # a "personal name" (NE) not in WordNet pos = xmlword.get('pos') # part of speech for the whole chunk (None for punctuation) if unit=='token': if not pos_tag and not sem_tag: itm = tkn else: itm = (tkn,) + ((pos,) if pos_tag else ()) + ((lemma, wnpos, sensenum, isOOVEntity) if sem_tag else ()) return itm else: ww = tkn.split('_') # TODO: case where punctuation intervenes in MWE if unit=='word': return ww else: if sensenum is not None: try: sense = wordnet.lemma_from_key(sense_key) # Lemma object except Exception: # cannot retrieve the wordnet.Lemma object. possible reasons: # (a) the wordnet corpus is not downloaded; # (b) a nonexistant sense is annotated: e.g., such.s.00 triggers: # nltk.corpus.reader.wordnet.WordNetError: No synset found for key u'such%5:00:01:specified:00' # solution: just use the lemma name as a string try: sense = '%s.%s.%02d' % (lemma, wnpos, int(sensenum)) # e.g.: reach.v.02 except ValueError: sense = lemma+'.'+wnpos+'.'+sensenum # e.g. the sense number may be "2;1" bottom = [Tree(pos, ww)] if pos_tag else ww if sem_tag and isOOVEntity: if sensenum is not None: return Tree(sense, [Tree('NE', bottom)]) else: # 'other' NE return Tree('NE', bottom) elif sem_tag and sensenum is not None: return Tree(sense, bottom) elif pos_tag: return bottom[0] else: return bottom # chunk as a list def _all_xmlwords_in(elt, result=None): if result is None: result = [] for child in elt: if child.tag in ('wf', 'punc'): result.append(child) else: _all_xmlwords_in(child, result) return result class SemcorSentence(list): """ A list of words, augmented by an attribute ``num`` used to record the sentence identifier (the ``n`` attribute from the XML). """ def __init__(self, num, items): self.num = num list.__init__(self, items) class SemcorWordView(XMLCorpusView): """ A stream backed corpus view specialized for use with the BNC corpus. """ def __init__(self, fileid, unit, bracket_sent, pos_tag, sem_tag, wordnet): """ :param fileid: The name of the underlying file. :param unit: One of `'token'`, `'word'`, or `'chunk'`. :param bracket_sent: If true, include sentence bracketing. :param pos_tag: Whether to include part-of-speech tags. :param sem_tag: Whether to include semantic tags, namely WordNet lemma and OOV named entity status. 
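
        A construction sketch mirroring how ``SemcorCorpusReader._items``
        builds these views (the fileid and WordNet reader are placeholders)::

            >>> from nltk.corpus import semcor, wordnet           # doctest: +SKIP
            >>> fid = semcor.abspath(semcor.fileids()[0])         # doctest: +SKIP
            >>> view = SemcorWordView(fid, 'chunk', True, True, True,
            ...                       wordnet)                    # doctest: +SKIP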
""" if bracket_sent: tagspec = '.*/s' else: tagspec = '.*/s/(punc|wf)' self._unit = unit self._sent = bracket_sent self._pos_tag = pos_tag self._sem_tag = sem_tag self._wordnet = wordnet XMLCorpusView.__init__(self, fileid, tagspec) def handle_elt(self, elt, context): if self._sent: return self.handle_sent(elt) else: return self.handle_word(elt) def handle_word(self, elt): return SemcorCorpusReader._word(elt, self._unit, self._pos_tag, self._sem_tag, self._wordnet) def handle_sent(self, elt): sent = [] for child in elt: if child.tag in ('wf','punc'): itm = self.handle_word(child) if self._unit=='word': sent.extend(itm) else: sent.append(itm) else: raise ValueError('Unexpected element %s' % child.tag) return SemcorSentence(elt.attrib['snum'], sent) nltk-3.1/nltk/corpus/reader/senseval.py0000644000076500000240000001721212607224144020000 0ustar sbstaff00000000000000# Natural Language Toolkit: Senseval 2 Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Trevor Cohn # Steven Bird (modifications) # URL: # For license information, see LICENSE.TXT """ Read from the Senseval 2 Corpus. SENSEVAL [http://www.senseval.org/] Evaluation exercises for Word Sense Disambiguation. Organized by ACL-SIGLEX [http://www.siglex.org/] Prepared by Ted Pedersen , University of Minnesota, http://www.d.umn.edu/~tpederse/data.html Distributed with permission. The NLTK version of the Senseval 2 files uses well-formed XML. Each instance of the ambiguous words "hard", "interest", "line", and "serve" is tagged with a sense identifier, and supplied with context. """ from __future__ import print_function, unicode_literals import re from xml.etree import ElementTree from nltk import compat from nltk.tokenize import * from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * @compat.python_2_unicode_compatible class SensevalInstance(object): def __init__(self, word, position, context, senses): self.word = word self.senses = tuple(senses) self.position = position self.context = context def __repr__(self): return ('SensevalInstance(word=%r, position=%r, ' 'context=%r, senses=%r)' % (self.word, self.position, self.context, self.senses)) class SensevalCorpusReader(CorpusReader): def instances(self, fileids=None): return concat([SensevalCorpusView(fileid, enc) for (fileid, enc) in self.abspaths(fileids, True)]) def raw(self, fileids=None): """ :return: the text contents of the given fileids, as a single string. """ if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def _entry(self, tree): elts = [] for lexelt in tree.findall('lexelt'): for inst in lexelt.findall('instance'): sense = inst[0].attrib['senseid'] context = [(w.text, w.attrib['pos']) for w in inst[1]] elts.append( (sense, context) ) return elts class SensevalCorpusView(StreamBackedCorpusView): def __init__(self, fileid, encoding): StreamBackedCorpusView.__init__(self, fileid, encoding=encoding) self._word_tokenizer = WhitespaceTokenizer() self._lexelt_starts = [0] # list of streampos self._lexelts = [None] # list of lexelt names def read_block(self, stream): # Decide which lexical element we're in. lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell())-1 lexelt = self._lexelts[lexelt_num] instance_lines = [] in_instance = False while True: line = stream.readline() if line == '': assert instance_lines == [] return [] # Start of a lexical element? if line.lstrip().startswith(' has no 'item=...' 
lexelt = m.group(1)[1:-1] if lexelt_num < len(self._lexelts): assert lexelt == self._lexelts[lexelt_num] else: self._lexelts.append(lexelt) self._lexelt_starts.append(stream.tell()) # Start of an instance? if line.lstrip().startswith('' elif cword.tag == 'wf': context.append((cword.text, cword.attrib['pos'])) elif cword.tag == 's': pass # Sentence boundary marker. else: print('ACK', cword.tag) assert False, 'expected CDATA or or ' if cword.tail: context += self._word_tokenizer.tokenize(cword.tail) else: assert False, 'unexpected tag %s' % child.tag return SensevalInstance(lexelt, position, context, senses) def _fixXML(text): """ Fix the various issues with Senseval pseudo-XML. """ # <~> or <^> => ~ or ^ text = re.sub(r'<([~\^])>', r'\1', text) # fix lone & text = re.sub(r'(\s+)\&(\s+)', r'\1&\2', text) # fix """ text = re.sub(r'"""', '\'"\'', text) # fix => text = re.sub(r'(<[^<]*snum=)([^">]+)>', r'\1"\2"/>', text) # fix foreign word tag text = re.sub(r'<\&frasl>\s*]*>', 'FRASL', text) # remove <&I .> text = re.sub(r'<\&I[^>]*>', '', text) # fix <{word}> text = re.sub(r'<{([^}]+)}>', r'\1', text) # remove <@>,

<p>, </p>
    text = re.sub(r'<(@|/?p)>', r'', text) # remove <&M .> and <&T .> and <&Ms .> text = re.sub(r'<&\w+ \.>', r'', text) # remove lines text = re.sub(r']*>', r'', text) # remove <[hi]> and <[/p]> etc text = re.sub(r'<\[\/?[^>]+\]*>', r'', text) # take the thing out of the brackets: <…> text = re.sub(r'<(\&\w+;)>', r'\1', text) # and remove the & for those patterns that aren't regular XML text = re.sub(r'&(?!amp|gt|lt|apos|quot)', r'', text) # fix 'abc ' style tags - now abc text = re.sub(r'[ \t]*([^<>\s]+?)[ \t]*', r' \1', text) text = re.sub(r'\s*"\s*', " \"", text) return text nltk-3.1/nltk/corpus/reader/sentiwordnet.py0000644000076500000240000001070012607224144020700 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: WordNet # # Copyright (C) 2001-2015 NLTK Project # Author: Christopher Potts # URL: # For license information, see LICENSE.TXT """ An NLTK interface for SentiWordNet SentiWordNet is a lexical resource for opinion mining. SentiWordNet assigns to each synset of WordNet three sentiment scores: positivity, negativity, and objectivity. For details about SentiWordNet see: http://sentiwordnet.isti.cnr.it/ >>> from nltk.corpus import sentiwordnet as swn >>> print(swn.senti_synset('breakdown.n.03')) >>> list(swn.senti_synsets('slow')) [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),\ SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),\ SentiSynset('slow.a.02'), SentiSynset('slow.a.04'),\ SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')] >>> happy = swn.senti_synsets('happy', 'a') >>> happy0 = list(happy)[0] >>> happy0.pos_score() 0.875 >>> happy0.neg_score() 0.0 >>> happy0.obj_score() 0.125 """ import re from nltk.compat import python_2_unicode_compatible from nltk.corpus.reader import CorpusReader @python_2_unicode_compatible class SentiWordNetCorpusReader(CorpusReader): def __init__(self, root, fileids, encoding='utf-8'): """ Construct a new SentiWordNet Corpus Reader, using data from the specified file. 
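
        Normally the pre-built ``nltk.corpus.sentiwordnet`` reader is used;
        a direct-construction sketch (the path and filename are assumptions)::

            >>> reader = SentiWordNetCorpusReader('/path/to/dir',
            ...                                   ['SentiWordNet_3.0.0.txt'])  # doctest: +SKIP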
""" super(SentiWordNetCorpusReader, self).__init__(root, fileids, encoding=encoding) if len(self._fileids) != 1: raise ValueError('Exactly one file must be specified') self._db = {} self._parse_src_file() def _parse_src_file(self): lines = self.open(self._fileids[0]).read().splitlines() lines = filter((lambda x : not re.search(r"^\s*#", x)), lines) for i, line in enumerate(lines): fields = [field.strip() for field in re.split(r"\t+", line)] try: pos, offset, pos_score, neg_score, synset_terms, gloss = fields except: raise ValueError('Line %s formatted incorrectly: %s\n' % (i, line)) if pos and offset: offset = int(offset) self._db[(pos, offset)] = (float(pos_score), float(neg_score)) def senti_synset(self, *vals): from nltk.corpus import wordnet as wn if tuple(vals) in self._db: pos_score, neg_score = self._db[tuple(vals)] pos, offset = vals synset = wn._synset_from_pos_and_offset(pos, offset) return SentiSynset(pos_score, neg_score, synset) else: synset = wn.synset(vals[0]) pos = synset.pos() offset = synset.offset() if (pos, offset) in self._db: pos_score, neg_score = self._db[(pos, offset)] return SentiSynset(pos_score, neg_score, synset) else: return None def senti_synsets(self, string, pos=None): from nltk.corpus import wordnet as wn sentis = [] synset_list = wn.synsets(string, pos) for synset in synset_list: sentis.append(self.senti_synset(synset.name())) sentis = filter(lambda x : x, sentis) return sentis def all_senti_synsets(self): from nltk.corpus import wordnet as wn for key, fields in self._db.items(): pos, offset = key pos_score, neg_score = fields synset = wn._synset_from_pos_and_offset(pos, offset) yield SentiSynset(pos_score, neg_score, synset) @python_2_unicode_compatible class SentiSynset(object): def __init__(self, pos_score, neg_score, synset): self._pos_score = pos_score self._neg_score = neg_score self._obj_score = 1.0 - (self._pos_score + self._neg_score) self.synset = synset def pos_score(self): return self._pos_score def neg_score(self): return self._neg_score def obj_score(self): return self._obj_score def __str__(self): """Prints just the Pos/Neg scores for now.""" s = "<" s += self.synset.name() + ": " s += "PosScore=%s " % self._pos_score s += "NegScore=%s" % self._neg_score s += ">" return s def __repr__(self): return "Senti" + repr(self.synset) nltk-3.1/nltk/corpus/reader/sinica_treebank.py0000644000076500000240000000461012607224144021277 0ustar sbstaff00000000000000# Natural Language Toolkit: Sinica Treebank Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT """ Sinica Treebank Corpus Sample http://rocling.iis.sinica.edu.tw/CKIP/engversion/treebank.htm 10,000 parsed sentences, drawn from the Academia Sinica Balanced Corpus of Modern Chinese. Parse tree notation is based on Information-based Case Grammar. Tagset documentation is available at http://www.sinica.edu.tw/SinicaCorpus/modern_e_wordtype.html Language and Knowledge Processing Group, Institute of Information Science, Academia Sinica It is distributed with the Natural Language Toolkit under the terms of the Creative Commons Attribution-NonCommercial-ShareAlike License [http://creativecommons.org/licenses/by-nc-sa/2.5/]. References: Feng-Yi Chen, Pi-Fang Tsai, Keh-Jiann Chen, and Chu-Ren Huang (1999) The Construction of Sinica Treebank. Computational Linguistics and Chinese Language Processing, 4, pp 87-104. Huang Chu-Ren, Keh-Jiann Chen, Feng-Yi Chen, Keh-Jiann Chen, Zhao-Ming Gao, and Kuang-Yu Chen. 2000. 
Sinica Treebank: Design Criteria, Annotation Guidelines, and On-line Interface. Proceedings of 2nd Chinese Language Processing Workshop, Association for Computational Linguistics. Chen Keh-Jiann and Yu-Ming Hsieh (2004) Chinese Treebanks and Grammar Extraction, Proceedings of IJCNLP-04, pp560-565. """ import os import re from nltk.tree import sinica_parse from nltk.tag import map_tag from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * IDENTIFIER = re.compile(r'^#\S+\s') APPENDIX = re.compile(r'(?<=\))#.*$') TAGWORD = re.compile(r':([^:()|]+):([^:()|]+)') WORD = re.compile(r':[^:()|]+:([^:()|]+)') class SinicaTreebankCorpusReader(SyntaxCorpusReader): """ Reader for the sinica treebank. """ def _read_block(self, stream): sent = stream.readline() sent = IDENTIFIER.sub('', sent) sent = APPENDIX.sub('', sent) return [sent] def _parse(self, sent): return sinica_parse(sent) def _tag(self, sent, tagset=None): tagged_sent = [(w,t) for (t,w) in TAGWORD.findall(sent)] if tagset and tagset != self._tagset: tagged_sent = [(w, map_tag(self._tagset, tagset, t)) for (w,t) in tagged_sent] return tagged_sent def _word(self, sent): return WORD.findall(sent) nltk-3.1/nltk/corpus/reader/string_category.py0000644000076500000240000000422612607224144021364 0ustar sbstaff00000000000000# Natural Language Toolkit: String Category Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ Read tuples from a corpus consisting of categorized strings. For example, from the question classification corpus: NUM:dist How far is it from Denver to Aspen ? LOC:city What county is Modesto , California in ? HUM:desc Who was Galileo ? DESC:def What is an atom ? NUM:date When did Hawaii become a state ? """ # based on PPAttachmentCorpusReader from nltk import compat from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * # [xx] Should the order of the tuple be reversed -- in most other places # in nltk, we use the form (data, tag) -- e.g., tagged words and # labeled texts for classifiers. class StringCategoryCorpusReader(CorpusReader): def __init__(self, root, fileids, delimiter=' ', encoding='utf8'): """ :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. :param delimiter: Field delimiter """ CorpusReader.__init__(self, root, fileids, encoding) self._delimiter = delimiter def tuples(self, fileids=None): if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([StreamBackedCorpusView(fileid, self._read_tuple_block, encoding=enc) for (fileid, enc) in self.abspaths(fileids, True)]) def raw(self, fileids=None): """ :return: the text contents of the given fileids, as a single string. 
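
        For structured access use ``tuples()``, which splits each line into a
        ``(category, text)`` pair such as the ``('NUM:dist', 'How far is it
        from Denver to Aspen ?')`` example in the module docstring. A sketch,
        assuming the question classification corpus and a ``train.txt``
        fileid are installed::

            >>> from nltk.corpus import qc               # doctest: +SKIP
            >>> qc.tuples('train.txt')[:2]               # doctest: +SKIP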
""" if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def _read_tuple_block(self, stream): line = stream.readline().strip() if line: return [tuple(line.split(self._delimiter, 1))] else: return [] nltk-3.1/nltk/corpus/reader/switchboard.py0000644000076500000240000001107012607224144020465 0ustar sbstaff00000000000000# Natural Language Toolkit: Switchboard Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT from __future__ import unicode_literals import re from nltk.tag import str2tuple, map_tag from nltk import compat from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * @compat.python_2_unicode_compatible class SwitchboardTurn(list): """ A specialized list object used to encode switchboard utterances. The elements of the list are the words in the utterance; and two attributes, ``speaker`` and ``id``, are provided to retrieve the spearker identifier and utterance id. Note that utterance ids are only unique within a given discourse. """ def __init__(self, words, speaker, id): list.__init__(self, words) self.speaker = speaker self.id = int(id) def __repr__(self): if len(self) == 0: text = '' elif isinstance(self[0], tuple): text = ' '.join('%s/%s' % w for w in self) else: text = ' '.join(self) return '<%s.%s: %r>' % (self.speaker, self.id, text) class SwitchboardCorpusReader(CorpusReader): _FILES = ['tagged'] # Use the "tagged" file even for non-tagged data methods, since # it's tokenized. def __init__(self, root, tagset=None): CorpusReader.__init__(self, root, self._FILES) self._tagset = tagset def words(self): return StreamBackedCorpusView(self.abspath('tagged'), self._words_block_reader) def tagged_words(self, tagset=None): def tagged_words_block_reader(stream): return self._tagged_words_block_reader(stream, tagset) return StreamBackedCorpusView(self.abspath('tagged'), tagged_words_block_reader) def turns(self): return StreamBackedCorpusView(self.abspath('tagged'), self._turns_block_reader) def tagged_turns(self, tagset=None): def tagged_turns_block_reader(stream): return self._tagged_turns_block_reader(stream, tagset) return StreamBackedCorpusView(self.abspath('tagged'), tagged_turns_block_reader) def discourses(self): return StreamBackedCorpusView(self.abspath('tagged'), self._discourses_block_reader) def tagged_discourses(self, tagset=False): def tagged_discourses_block_reader(stream): return self._tagged_discourses_block_reader(stream, tagset) return StreamBackedCorpusView(self.abspath('tagged'), tagged_discourses_block_reader) def _discourses_block_reader(self, stream): # returns at most 1 discourse. (The other methods depend on this.) return [[self._parse_utterance(u, include_tag=False) for b in read_blankline_block(stream) for u in b.split('\n') if u.strip()]] def _tagged_discourses_block_reader(self, stream, tagset=None): # returns at most 1 discourse. (The other methods depend on this.) 
return [[self._parse_utterance(u, include_tag=True, tagset=tagset) for b in read_blankline_block(stream) for u in b.split('\n') if u.strip()]] def _turns_block_reader(self, stream): return self._discourses_block_reader(stream)[0] def _tagged_turns_block_reader(self, stream, tagset=None): return self._tagged_discourses_block_reader(stream, tagset)[0] def _words_block_reader(self, stream): return sum(self._discourses_block_reader(stream)[0], []) def _tagged_words_block_reader(self, stream, tagset=None): return sum(self._tagged_discourses_block_reader(stream, tagset)[0], []) _UTTERANCE_RE = re.compile('(\w+)\.(\d+)\:\s*(.*)') _SEP = '/' def _parse_utterance(self, utterance, include_tag, tagset=None): m = self._UTTERANCE_RE.match(utterance) if m is None: raise ValueError('Bad utterance %r' % utterance) speaker, id, text = m.groups() words = [str2tuple(s, self._SEP) for s in text.split()] if not include_tag: words = [w for (w,t) in words] elif tagset and tagset != self._tagset: words = [(w, map_tag(self._tagset, tagset, t)) for (w,t) in words] return SwitchboardTurn(words, speaker, id) nltk-3.1/nltk/corpus/reader/tagged.py0000644000076500000240000003046012607224144017413 0ustar sbstaff00000000000000# Natural Language Toolkit: Tagged Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # Jacob Perkins # URL: # For license information, see LICENSE.TXT """ A reader for corpora whose documents contain part-of-speech-tagged words. """ import os from nltk import compat from nltk.tag import str2tuple, map_tag from nltk.tokenize import * from nltk.corpus.reader.api import * from nltk.corpus.reader.util import * from nltk.corpus.reader.timit import read_timit_block class TaggedCorpusReader(CorpusReader): """ Reader for simple part-of-speech tagged corpora. Paragraphs are assumed to be split using blank lines. Sentences and words can be tokenized using the default tokenizers, or by custom tokenizers specified as parameters to the constructor. Words are parsed using ``nltk.tag.str2tuple``. By default, ``'/'`` is used as the separator. I.e., words should have the form:: word1/tag1 word2/tag2 word3/tag3 ... But custom separators may be specified as parameters to the constructor. Part of speech tags are case-normalized to upper case. """ def __init__(self, root, fileids, sep='/', word_tokenizer=WhitespaceTokenizer(), sent_tokenizer=RegexpTokenizer('\n', gaps=True), para_block_reader=read_blankline_block, encoding='utf8', tagset=None): """ Construct a new Tagged Corpus reader for a set of documents located at the given root directory. Example usage: >>> root = '/...path to corpus.../' >>> reader = TaggedCorpusReader(root, '.*', '.txt') # doctest: +SKIP :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. """ CorpusReader.__init__(self, root, fileids, encoding) self._sep = sep self._word_tokenizer = word_tokenizer self._sent_tokenizer = sent_tokenizer self._para_block_reader = para_block_reader self._tagset = tagset def raw(self, fileids=None): """ :return: the given file(s) as a single string. :rtype: str """ if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def words(self, fileids=None): """ :return: the given file(s) as a list of words and punctuation symbols. 
:rtype: list(str) """ return concat([TaggedCorpusView(fileid, enc, False, False, False, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, None) for (fileid, enc) in self.abspaths(fileids, True)]) def sents(self, fileids=None): """ :return: the given file(s) as a list of sentences or utterances, each encoded as a list of word strings. :rtype: list(list(str)) """ return concat([TaggedCorpusView(fileid, enc, False, True, False, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, None) for (fileid, enc) in self.abspaths(fileids, True)]) def paras(self, fileids=None): """ :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as lists of word strings. :rtype: list(list(list(str))) """ return concat([TaggedCorpusView(fileid, enc, False, True, True, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, None) for (fileid, enc) in self.abspaths(fileids, True)]) def tagged_words(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of tagged words and punctuation symbols, encoded as tuples ``(word,tag)``. :rtype: list(tuple(str,str)) """ if tagset and tagset != self._tagset: tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) else: tag_mapping_function = None return concat([TaggedCorpusView(fileid, enc, True, False, False, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, tag_mapping_function) for (fileid, enc) in self.abspaths(fileids, True)]) def tagged_sents(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of sentences, each encoded as a list of ``(word,tag)`` tuples. :rtype: list(list(tuple(str,str))) """ if tagset and tagset != self._tagset: tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) else: tag_mapping_function = None return concat([TaggedCorpusView(fileid, enc, True, True, False, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, tag_mapping_function) for (fileid, enc) in self.abspaths(fileids, True)]) def tagged_paras(self, fileids=None, tagset=None): """ :return: the given file(s) as a list of paragraphs, each encoded as a list of sentences, which are in turn encoded as lists of ``(word,tag)`` tuples. :rtype: list(list(list(tuple(str,str)))) """ if tagset and tagset != self._tagset: tag_mapping_function = lambda t: map_tag(self._tagset, tagset, t) else: tag_mapping_function = None return concat([TaggedCorpusView(fileid, enc, True, True, True, self._sep, self._word_tokenizer, self._sent_tokenizer, self._para_block_reader, tag_mapping_function) for (fileid, enc) in self.abspaths(fileids, True)]) class CategorizedTaggedCorpusReader(CategorizedCorpusReader, TaggedCorpusReader): """ A reader for part-of-speech tagged corpora whose documents are divided into categories based on their file identifiers. """ def __init__(self, *args, **kwargs): """ Initialize the corpus reader. Categorization arguments (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to the ``CategorizedCorpusReader`` constructor. The remaining arguments are passed to the ``TaggedCorpusReader``. 
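
        For example, the Brown corpus bundled with NLTK is loaded through
        this reader, so (assuming the corpus data is installed)::

            >>> from nltk.corpus import brown                   # doctest: +SKIP
            >>> brown.categories()[:3]                          # doctest: +SKIP
            >>> brown.tagged_words(categories='news')[:2]       # doctest: +SKIP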
""" CategorizedCorpusReader.__init__(self, kwargs) TaggedCorpusReader.__init__(self, *args, **kwargs) def _resolve(self, fileids, categories): if fileids is not None and categories is not None: raise ValueError('Specify fileids or categories, not both') if categories is not None: return self.fileids(categories) else: return fileids def raw(self, fileids=None, categories=None): return TaggedCorpusReader.raw( self, self._resolve(fileids, categories)) def words(self, fileids=None, categories=None): return TaggedCorpusReader.words( self, self._resolve(fileids, categories)) def sents(self, fileids=None, categories=None): return TaggedCorpusReader.sents( self, self._resolve(fileids, categories)) def paras(self, fileids=None, categories=None): return TaggedCorpusReader.paras( self, self._resolve(fileids, categories)) def tagged_words(self, fileids=None, categories=None, tagset=None): return TaggedCorpusReader.tagged_words( self, self._resolve(fileids, categories), tagset) def tagged_sents(self, fileids=None, categories=None, tagset=None): return TaggedCorpusReader.tagged_sents( self, self._resolve(fileids, categories), tagset) def tagged_paras(self, fileids=None, categories=None, tagset=None): return TaggedCorpusReader.tagged_paras( self, self._resolve(fileids, categories), tagset) class TaggedCorpusView(StreamBackedCorpusView): """ A specialized corpus view for tagged documents. It can be customized via flags to divide the tagged corpus documents up by sentence or paragraph, and to include or omit part of speech tags. ``TaggedCorpusView`` objects are typically created by ``TaggedCorpusReader`` (not directly by nltk users). """ def __init__(self, corpus_file, encoding, tagged, group_by_sent, group_by_para, sep, word_tokenizer, sent_tokenizer, para_block_reader, tag_mapping_function=None): self._tagged = tagged self._group_by_sent = group_by_sent self._group_by_para = group_by_para self._sep = sep self._word_tokenizer = word_tokenizer self._sent_tokenizer = sent_tokenizer self._para_block_reader = para_block_reader self._tag_mapping_function = tag_mapping_function StreamBackedCorpusView.__init__(self, corpus_file, encoding=encoding) def read_block(self, stream): """Reads one paragraph at a time.""" block = [] for para_str in self._para_block_reader(stream): para = [] for sent_str in self._sent_tokenizer.tokenize(para_str): sent = [str2tuple(s, self._sep) for s in self._word_tokenizer.tokenize(sent_str)] if self._tag_mapping_function: sent = [(w, self._tag_mapping_function(t)) for (w,t) in sent] if not self._tagged: sent = [w for (w,t) in sent] if self._group_by_sent: para.append(sent) else: para.extend(sent) if self._group_by_para: block.append(para) else: block.extend(para) return block # needs to implement simplified tags class MacMorphoCorpusReader(TaggedCorpusReader): """ A corpus reader for the MAC_MORPHO corpus. Each line contains a single tagged word, using '_' as a separator. Sentence boundaries are based on the end-sentence tag ('_.'). Paragraph information is not included in the corpus, so each paragraph returned by ``self.paras()`` and ``self.tagged_paras()`` contains a single sentence. 
""" def __init__(self, root, fileids, encoding='utf8', tagset=None): TaggedCorpusReader.__init__( self, root, fileids, sep='_', word_tokenizer=LineTokenizer(), sent_tokenizer=RegexpTokenizer('.*\n'), para_block_reader=self._read_block, encoding=encoding, tagset=tagset) def _read_block(self, stream): return read_regexp_block(stream, r'.*', r'.*_\.') class TimitTaggedCorpusReader(TaggedCorpusReader): """ A corpus reader for tagged sentences that are included in the TIMIT corpus. """ def __init__(self, *args, **kwargs): TaggedCorpusReader.__init__( self, para_block_reader=read_timit_block, *args, **kwargs) def paras(self): raise NotImplementedError('use sents() instead') def tagged_paras(self): raise NotImplementedError('use tagged_sents() instead') nltk-3.1/nltk/corpus/reader/timit.py0000644000076500000240000004107412574600335017314 0ustar sbstaff00000000000000# Natural Language Toolkit: TIMIT Corpus Reader # # Copyright (C) 2001-2007 NLTK Project # Author: Haejoong Lee # Steven Bird # Jacob Perkins # URL: # For license information, see LICENSE.TXT # [xx] this docstring is out-of-date: """ Read tokens, phonemes and audio data from the NLTK TIMIT Corpus. This corpus contains selected portion of the TIMIT corpus. - 16 speakers from 8 dialect regions - 1 male and 1 female from each dialect region - total 130 sentences (10 sentences per speaker. Note that some sentences are shared among other speakers, especially sa1 and sa2 are spoken by all speakers.) - total 160 recording of sentences (10 recordings per speaker) - audio format: NIST Sphere, single channel, 16kHz sampling, 16 bit sample, PCM encoding Module contents =============== The timit corpus reader provides 4 functions and 4 data items. - utterances List of utterances in the corpus. There are total 160 utterances, each of which corresponds to a unique utterance of a speaker. Here's an example of an utterance identifier in the list:: dr1-fvmh0/sx206 - _---- _--- | | | | | | | | | | | | | | `--- sentence number | | | `----- sentence type (a:all, i:shared, x:exclusive) | | `--------- speaker ID | `------------ sex (m:male, f:female) `-------------- dialect region (1..8) - speakers List of speaker IDs. An example of speaker ID:: dr1-fvmh0 Note that if you split an item ID with colon and take the first element of the result, you will get a speaker ID. >>> itemid = 'dr1-fvmh0/sx206' >>> spkrid , sentid = itemid.split('/') >>> spkrid 'dr1-fvmh0' The second element of the result is a sentence ID. - dictionary() Phonetic dictionary of words contained in this corpus. This is a Python dictionary from words to phoneme lists. - spkrinfo() Speaker information table. It's a Python dictionary from speaker IDs to records of 10 fields. Speaker IDs the same as the ones in timie.speakers. 
Each record is a dictionary from field names to values, and the fields are as follows:: id speaker ID as defined in the original TIMIT speaker info table sex speaker gender (M:male, F:female) dr speaker dialect region (1:new england, 2:northern, 3:north midland, 4:south midland, 5:southern, 6:new york city, 7:western, 8:army brat (moved around)) use corpus type (TRN:training, TST:test) in this sample corpus only TRN is available recdate recording date birthdate speaker birth date ht speaker height race speaker race (WHT:white, BLK:black, AMR:american indian, SPN:spanish-american, ORN:oriental,???:unknown) edu speaker education level (HS:high school, AS:associate degree, BS:bachelor's degree (BS or BA), MS:master's degree (MS or MA), PHD:doctorate degree (PhD,JD,MD), ??:unknown) comments comments by the recorder The 4 functions are as follows. - tokenized(sentences=items, offset=False) Given a list of items, returns an iterator of a list of word lists, each of which corresponds to an item (sentence). If offset is set to True, each element of the word list is a tuple of word(string), start offset and end offset, where offset is represented as a number of 16kHz samples. - phonetic(sentences=items, offset=False) Given a list of items, returns an iterator of a list of phoneme lists, each of which corresponds to an item (sentence). If offset is set to True, each element of the phoneme list is a tuple of word(string), start offset and end offset, where offset is represented as a number of 16kHz samples. - audiodata(item, start=0, end=None) Given an item, returns a chunk of audio samples formatted into a string. When the fuction is called, if start and end are omitted, the entire samples of the recording will be returned. If only end is omitted, samples from the start offset to the end of the recording will be returned. - play(data) Play the given audio samples. The audio samples can be obtained from the timit.audiodata function. """ from __future__ import print_function, unicode_literals import sys import os import re import tempfile import time from nltk import compat from nltk.tree import Tree from nltk.internals import import_from_stdlib from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class TimitCorpusReader(CorpusReader): """ Reader for the TIMIT corpus (or any other corpus with the same file layout and use of file formats). The corpus root directory should contain the following files: - timitdic.txt: dictionary of standard transcriptions - spkrinfo.txt: table of speaker information In addition, the root directory should contain one subdirectory for each speaker, containing three files for each utterance: - .txt: text content of utterances - .wrd: tokenized text content of utterances - .phn: phonetic transcription of utterances - .wav: utterance sound file """ _FILE_RE = (r'(\w+-\w+/\w+\.(phn|txt|wav|wrd))|' + r'timitdic\.txt|spkrinfo\.txt') """A regexp matching fileids that are used by this corpus reader.""" _UTTERANCE_RE = r'\w+-\w+/\w+\.txt' def __init__(self, root, encoding='utf8'): """ Construct a new TIMIT corpus reader in the given directory. :param root: The root directory for this corpus. 
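
        Typical access is through the pre-built reader (a sketch; assumes
        the TIMIT sample corpus is installed)::

            >>> from nltk.corpus import timit            # doctest: +SKIP
            >>> utt = timit.utteranceids()[0]            # doctest: +SKIP
            >>> timit.words(utt)[:5]                     # doctest: +SKIP
            >>> timit.phones(utt)[:5]                    # doctest: +SKIP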
""" # Ensure that wave files don't get treated as unicode data: if isinstance(encoding, compat.string_types): encoding = [('.*\.wav', None), ('.*', encoding)] CorpusReader.__init__(self, root, find_corpus_fileids(root, self._FILE_RE), encoding=encoding) self._utterances = [name[:-4] for name in find_corpus_fileids(root, self._UTTERANCE_RE)] """A list of the utterance identifiers for all utterances in this corpus.""" self._speakerinfo = None self._root = root self.speakers = sorted(set(u.split('/')[0] for u in self._utterances)) def fileids(self, filetype=None): """ Return a list of file identifiers for the files that make up this corpus. :param filetype: If specified, then ``filetype`` indicates that only the files that have the given type should be returned. Accepted values are: ``txt``, ``wrd``, ``phn``, ``wav``, or ``metadata``, """ if filetype is None: return CorpusReader.fileids(self) elif filetype in ('txt', 'wrd', 'phn', 'wav'): return ['%s.%s' % (u, filetype) for u in self._utterances] elif filetype == 'metadata': return ['timitdic.txt', 'spkrinfo.txt'] else: raise ValueError('Bad value for filetype: %r' % filetype) def utteranceids(self, dialect=None, sex=None, spkrid=None, sent_type=None, sentid=None): """ :return: A list of the utterance identifiers for all utterances in this corpus, or for the given speaker, dialect region, gender, sentence type, or sentence number, if specified. """ if isinstance(dialect, compat.string_types): dialect = [dialect] if isinstance(sex, compat.string_types): sex = [sex] if isinstance(spkrid, compat.string_types): spkrid = [spkrid] if isinstance(sent_type, compat.string_types): sent_type = [sent_type] if isinstance(sentid, compat.string_types): sentid = [sentid] utterances = self._utterances[:] if dialect is not None: utterances = [u for u in utterances if u[2] in dialect] if sex is not None: utterances = [u for u in utterances if u[4] in sex] if spkrid is not None: utterances = [u for u in utterances if u[:9] in spkrid] if sent_type is not None: utterances = [u for u in utterances if u[11] in sent_type] if sentid is not None: utterances = [u for u in utterances if u[10:] in spkrid] return utterances def transcription_dict(self): """ :return: A dictionary giving the 'standard' transcription for each word. """ _transcriptions = {} for line in self.open('timitdic.txt'): if not line.strip() or line[0] == ';': continue m = re.match(r'\s*(\S+)\s+/(.*)/\s*$', line) if not m: raise ValueError('Bad line: %r' % line) _transcriptions[m.group(1)] = m.group(2).split() return _transcriptions def spkrid(self, utterance): return utterance.split('/')[0] def sentid(self, utterance): return utterance.split('/')[1] def utterance(self, spkrid, sentid): return '%s/%s' % (spkrid, sentid) def spkrutteranceids(self, speaker): """ :return: A list of all utterances associated with a given speaker. """ return [utterance for utterance in self._utterances if utterance.startswith(speaker+'/')] def spkrinfo(self, speaker): """ :return: A dictionary mapping .. something. 
""" if speaker in self._utterances: speaker = self.spkrid(speaker) if self._speakerinfo is None: self._speakerinfo = {} for line in self.open('spkrinfo.txt'): if not line.strip() or line[0] == ';': continue rec = line.strip().split(None, 9) key = "dr%s-%s%s" % (rec[2],rec[1].lower(),rec[0].lower()) self._speakerinfo[key] = SpeakerInfo(*rec) return self._speakerinfo[speaker] def phones(self, utterances=None): return [line.split()[-1] for fileid in self._utterance_fileids(utterances, '.phn') for line in self.open(fileid) if line.strip()] def phone_times(self, utterances=None): """ offset is represented as a number of 16kHz samples! """ return [(line.split()[2], int(line.split()[0]), int(line.split()[1])) for fileid in self._utterance_fileids(utterances, '.phn') for line in self.open(fileid) if line.strip()] def words(self, utterances=None): return [line.split()[-1] for fileid in self._utterance_fileids(utterances, '.wrd') for line in self.open(fileid) if line.strip()] def word_times(self, utterances=None): return [(line.split()[2], int(line.split()[0]), int(line.split()[1])) for fileid in self._utterance_fileids(utterances, '.wrd') for line in self.open(fileid) if line.strip()] def sents(self, utterances=None): return [[line.split()[-1] for line in self.open(fileid) if line.strip()] for fileid in self._utterance_fileids(utterances, '.wrd')] def sent_times(self, utterances=None): return [(line.split(None,2)[-1].strip(), int(line.split()[0]), int(line.split()[1])) for fileid in self._utterance_fileids(utterances, '.txt') for line in self.open(fileid) if line.strip()] def phone_trees(self, utterances=None): if utterances is None: utterances = self._utterances if isinstance(utterances, compat.string_types): utterances = [utterances] trees = [] for utterance in utterances: word_times = self.word_times(utterance) phone_times = self.phone_times(utterance) sent_times = self.sent_times(utterance) while sent_times: (sent, sent_start, sent_end) = sent_times.pop(0) trees.append(Tree('S', [])) while (word_times and phone_times and phone_times[0][2] <= word_times[0][1]): trees[-1].append(phone_times.pop(0)[0]) while word_times and word_times[0][2] <= sent_end: (word, word_start, word_end) = word_times.pop(0) trees[-1].append(Tree(word, [])) while phone_times and phone_times[0][2] <= word_end: trees[-1][-1].append(phone_times.pop(0)[0]) while phone_times and phone_times[0][2] <= sent_end: trees[-1].append(phone_times.pop(0)[0]) return trees # [xx] NOTE: This is currently broken -- we're assuming that the # fileids are WAV fileids (aka RIFF), but they're actually NIST SPHERE # fileids. def wav(self, utterance, start=0, end=None): # nltk.chunk conflicts with the stdlib module 'chunk' wave = import_from_stdlib('wave') w = wave.open(self.open(utterance+'.wav'), 'rb') if end is None: end = w.getnframes() # Skip past frames before start, then read the frames we want w.readframes(start) frames = w.readframes(end-start) # Open a new temporary file -- the wave module requires # an actual file, and won't work w/ stringio. :( tf = tempfile.TemporaryFile() out = wave.open(tf, 'w') # Write the parameters & data to the new file. out.setparams(w.getparams()) out.writeframes(frames) out.close() # Read the data back from the file, and return it. The # file will automatically be deleted when we return. 
tf.seek(0) return tf.read() def audiodata(self, utterance, start=0, end=None): assert(end is None or end > start) headersize = 44 if end is None: data = self.open(utterance+'.wav').read() else: data = self.open(utterance+'.wav').read(headersize+end*2) return data[headersize+start*2:] def _utterance_fileids(self, utterances, extension): if utterances is None: utterances = self._utterances if isinstance(utterances, compat.string_types): utterances = [utterances] return ['%s%s' % (u, extension) for u in utterances] def play(self, utterance, start=0, end=None): """ Play the given audio sample. :param utterance: The utterance id of the sample to play """ # Method 1: os audio dev. try: import ossaudiodev try: dsp = ossaudiodev.open('w') dsp.setfmt(ossaudiodev.AFMT_S16_LE) dsp.channels(1) dsp.speed(16000) dsp.write(self.audiodata(utterance, start, end)) dsp.close() except IOError as e: print(("can't acquire the audio device; please " "activate your audio device."), file=sys.stderr) print("system error message:", str(e), file=sys.stderr) return except ImportError: pass # Method 2: pygame try: # FIXME: this won't work under python 3 import pygame.mixer, StringIO pygame.mixer.init(16000) f = StringIO.StringIO(self.wav(utterance, start, end)) pygame.mixer.Sound(f).play() while pygame.mixer.get_busy(): time.sleep(0.01) return except ImportError: pass # Method 3: complain. :) print(("you must install pygame or ossaudiodev " "for audio playback."), file=sys.stderr) @compat.python_2_unicode_compatible class SpeakerInfo(object): def __init__(self, id, sex, dr, use, recdate, birthdate, ht, race, edu, comments=None): self.id = id self.sex = sex self.dr = dr self.use = use self.recdate = recdate self.birthdate = birthdate self.ht = ht self.race = race self.edu = edu self.comments = comments def __repr__(self): attribs = 'id sex dr use recdate birthdate ht race edu comments' args = ['%s=%r' % (attr, getattr(self, attr)) for attr in attribs.split()] return 'SpeakerInfo(%s)' % (', '.join(args)) def read_timit_block(stream): """ Block reader for timit tagged sentences, which are preceded by a sentence number that will be ignored. """ line = stream.readline() if not line: return [] n, sent = line.split(' ', 1) return [sent] nltk-3.1/nltk/corpus/reader/toolbox.py0000644000076500000240000000414212607224144017644 0ustar sbstaff00000000000000# Natural Language Toolkit: Toolbox Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Greg Aumann # Stuart Robinson # Steven Bird # URL: # For license information, see LICENSE.TXT """ Module for reading, writing and manipulating Toolbox databases and settings fileids. 
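
A usage sketch via the bundled sample data (the fileid is an assumption;
the ``toolbox`` corpus must be installed)::

    >>> from nltk.corpus import toolbox              # doctest: +SKIP
    >>> toolbox.entries('rotokas.dic')[0]            # doctest: +SKIP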
""" import os import re import codecs from nltk import compat from nltk.toolbox import ToolboxData from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class ToolboxCorpusReader(CorpusReader): def xml(self, fileids, key=None): return concat([ToolboxData(path, enc).parse(key=key) for (path, enc) in self.abspaths(fileids, True)]) def fields(self, fileids, strip=True, unwrap=True, encoding='utf8', errors='strict', unicode_fields=None): return concat([list(ToolboxData(fileid,enc).fields( strip, unwrap, encoding, errors, unicode_fields)) for (fileid, enc) in self.abspaths(fileids, include_encoding=True)]) # should probably be done lazily: def entries(self, fileids, **kwargs): if 'key' in kwargs: key = kwargs['key'] del kwargs['key'] else: key = 'lx' # the default key in MDF entries = [] for marker, contents in self.fields(fileids, **kwargs): if marker == key: entries.append((contents, [])) else: try: entries[-1][-1].append((marker, contents)) except IndexError: pass return entries def words(self, fileids, key='lx'): return [contents for marker, contents in self.fields(fileids) if marker == key] def raw(self, fileids): if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def demo(): pass if __name__ == '__main__': demo() nltk-3.1/nltk/corpus/reader/twitter.py0000644000076500000240000001127012607224144017660 0ustar sbstaff00000000000000# Natural Language Toolkit: Twitter Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT """ A reader for corpora that consist of Tweets. It is assumed that the Tweets have been serialised into line-delimited JSON. """ import json import os from nltk import compat from nltk.tokenize import TweetTokenizer from nltk.corpus.reader.util import StreamBackedCorpusView, concat, ZipFilePathPointer from nltk.corpus.reader.api import CorpusReader class TwitterCorpusReader(CorpusReader): """ Reader for corpora that consist of Tweets represented as a list of line-delimited JSON. Individual Tweets can be tokenized using the default tokenizer, or by a custom tokenizer specified as a parameter to the constructor. Construct a new Tweet corpus reader for a set of documents located at the given root directory. If you made your own tweet collection in a directory called `twitter-files`, then you can initialise the reader as:: from nltk.corpus import TwitterCorpusReader reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json') However, the recommended approach is to set the relevant directory as the value of the environmental variable `TWITTER`, and then invoke the reader as follows:: root = os.environ['TWITTER'] reader = TwitterCorpusReader(root, '.*\.json') If you want to work directly with the raw Tweets, the `json` library can be used:: import json for tweet in reader.docs(): print(json.dumps(tweet, indent=1, sort_keys=True)) """ CorpusView = StreamBackedCorpusView """ The corpus view class used by this reader. """ def __init__(self, root, fileids=None, word_tokenizer=TweetTokenizer(), encoding='utf8'): """ :param root: The root directory for this corpus. :param fileids: A list or regexp specifying the fileids in this corpus. :param word_tokenizer: Tokenizer for breaking the text of Tweets into smaller units, including but not limited to words. 
""" CorpusReader.__init__(self, root, fileids, encoding) for path in self.abspaths(self._fileids): if isinstance(path, ZipFilePathPointer): pass elif os.path.getsize(path) == 0: raise ValueError("File {} is empty".format(path)) """Check that all user-created corpus files are non-empty.""" self._word_tokenizer = word_tokenizer def docs(self, fileids=None): """ Returns the full Tweet objects, as specified by `Twitter documentation on Tweets `_ :return: the given file(s) as a list of dictionaries deserialised from JSON. :rtype: list(dict) """ return concat([self.CorpusView(path, self._read_tweets, encoding=enc) for (path, enc, fileid) in self.abspaths(fileids, True, True)]) def strings(self, fileids=None): """ Returns only the text content of Tweets in the file(s) :return: the given file(s) as a list of Tweets. :rtype: list(str) """ fulltweets = self.docs(fileids) tweets = [] for jsono in fulltweets: try: text = jsono['text'] if isinstance(text, bytes): text = text.decode(self.encoding) tweets.append(text) except KeyError: pass return tweets def tokenized(self, fileids=None): """ :return: the given file(s) as a list of the text content of Tweets as as a list of words, screenanames, hashtags, URLs and punctuation symbols. :rtype: list(list(str)) """ tweets = self.strings(fileids) tokenizer = self._word_tokenizer return [tokenizer.tokenize(t) for t in tweets] def raw(self, fileids=None): """ Return the corpora in their raw form. """ if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) def _read_tweets(self, stream): """ Assumes that each line in ``stream`` is a JSON-serialised object. """ tweets = [] for i in range(10): line = stream.readline() if not line: return tweets tweet = json.loads(line) tweets.append(tweet) return tweets nltk-3.1/nltk/corpus/reader/udhr.py0000644000076500000240000000476012574600335017131 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- """ UDHR corpus reader. It mostly deals with encodings. 
""" from __future__ import absolute_import, unicode_literals from nltk.corpus.reader.util import find_corpus_fileids from nltk.corpus.reader.plaintext import PlaintextCorpusReader class UdhrCorpusReader(PlaintextCorpusReader): ENCODINGS = [ ('.*-Latin1$', 'latin-1'), ('.*-Hebrew$', 'hebrew'), ('.*-Arabic$', 'cp1256'), ('Czech_Cesky-UTF8', 'cp1250'), # yeah ('.*-Cyrillic$', 'cyrillic'), ('.*-SJIS$', 'SJIS'), ('.*-GB2312$', 'GB2312'), ('.*-Latin2$', 'ISO-8859-2'), ('.*-Greek$', 'greek'), ('.*-UTF8$', 'utf-8'), ('Hungarian_Magyar-Unicode', 'utf-16-le'), ('Amahuaca', 'latin1'), ('Turkish_Turkce-Turkish', 'latin5'), ('Lithuanian_Lietuviskai-Baltic', 'latin4'), ('Japanese_Nihongo-EUC', 'EUC-JP'), ('Japanese_Nihongo-JIS', 'iso2022_jp'), ('Chinese_Mandarin-HZ', 'hz'), ('Abkhaz\-Cyrillic\+Abkh', 'cp1251'), ] SKIP = set([ # The following files are not fully decodable because they # were truncated at wrong bytes: 'Burmese_Myanmar-UTF8', 'Japanese_Nihongo-JIS', 'Chinese_Mandarin-HZ', 'Chinese_Mandarin-UTF8', 'Gujarati-UTF8', 'Hungarian_Magyar-Unicode', 'Lao-UTF8', 'Magahi-UTF8', 'Marathi-UTF8', 'Tamil-UTF8', # Unfortunately, encodings required for reading # the following files are not supported by Python: 'Vietnamese-VPS', 'Vietnamese-VIQR', 'Vietnamese-TCVN', 'Magahi-Agra', 'Bhojpuri-Agra', 'Esperanto-T61', # latin3 raises an exception # The following files are encoded for specific fonts: 'Burmese_Myanmar-WinResearcher', 'Armenian-DallakHelv', 'Tigrinya_Tigrigna-VG2Main', 'Amharic-Afenegus6..60375', # ? 'Navaho_Dine-Navajo-Navaho-font', # What are these? 'Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117', 'Azeri_Azerbaijani_Latin-Az.Times.Lat0117', # The following files are unintended: 'Czech-Latin2-err', 'Russian_Russky-UTF8~', ]) def __init__(self, root='udhr'): fileids = find_corpus_fileids(root, r'(?!README|\.).*') super(UdhrCorpusReader, self).__init__( root, [fileid for fileid in fileids if fileid not in self.SKIP], encoding=self.ENCODINGS ) nltk-3.1/nltk/corpus/reader/util.py0000644000076500000240000007377212607224144017152 0ustar sbstaff00000000000000# Natural Language Toolkit: Corpus Reader Utilities # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT import os import bisect import re import tempfile from functools import reduce try: import cPickle as pickle except ImportError: import pickle # Use the c version of ElementTree, which is faster, if possible: try: from xml.etree import cElementTree as ElementTree except ImportError: from xml.etree import ElementTree from nltk.compat import string_types, text_type from nltk.tokenize import wordpunct_tokenize from nltk.internals import slice_bounds from nltk.data import PathPointer, FileSystemPathPointer, ZipFilePathPointer from nltk.data import SeekableUnicodeStreamReader from nltk.util import AbstractLazySequence, LazySubsequence, LazyConcatenation, py25 ###################################################################### #{ Corpus View ###################################################################### class StreamBackedCorpusView(AbstractLazySequence): """ A 'view' of a corpus file, which acts like a sequence of tokens: it can be accessed by index, iterated over, etc. However, the tokens are only constructed as-needed -- the entire corpus is never stored in memory at once. The constructor to ``StreamBackedCorpusView`` takes two arguments: a corpus fileid (specified as a string or as a ``PathPointer``); and a block reader. 
A "block reader" is a function that reads zero or more tokens from a stream, and returns them as a list. A very simple example of a block reader is: >>> def simple_block_reader(stream): ... return stream.readline().split() This simple block reader reads a single line at a time, and returns a single token (consisting of a string) for each whitespace-separated substring on the line. When deciding how to define the block reader for a given corpus, careful consideration should be given to the size of blocks handled by the block reader. Smaller block sizes will increase the memory requirements of the corpus view's internal data structures (by 2 integers per block). On the other hand, larger block sizes may decrease performance for random access to the corpus. (But note that larger block sizes will *not* decrease performance for iteration.) Internally, ``CorpusView`` maintains a partial mapping from token index to file position, with one entry per block. When a token with a given index *i* is requested, the ``CorpusView`` constructs it as follows: 1. First, it searches the toknum/filepos mapping for the token index closest to (but less than or equal to) *i*. 2. Then, starting at the file position corresponding to that index, it reads one block at a time using the block reader until it reaches the requested token. The toknum/filepos mapping is created lazily: it is initially empty, but every time a new block is read, the block's initial token is added to the mapping. (Thus, the toknum/filepos map has one entry per block.) In order to increase efficiency for random access patterns that have high degrees of locality, the corpus view may cache one or more blocks. :note: Each ``CorpusView`` object internally maintains an open file object for its underlying corpus file. This file should be automatically closed when the ``CorpusView`` is garbage collected, but if you wish to close it manually, use the ``close()`` method. If you access a ``CorpusView``'s items after it has been closed, the file object will be automatically re-opened. :warning: If the contents of the file are modified during the lifetime of the ``CorpusView``, then the ``CorpusView``'s behavior is undefined. :warning: If a unicode encoding is specified when constructing a ``CorpusView``, then the block reader may only call ``stream.seek()`` with offsets that have been returned by ``stream.tell()``; in particular, calling ``stream.seek()`` with relative offsets, or with offsets based on string lengths, may lead to incorrect behavior. :ivar _block_reader: The function used to read a single block from the underlying file stream. :ivar _toknum: A list containing the token index of each block that has been processed. In particular, ``_toknum[i]`` is the token index of the first token in block ``i``. Together with ``_filepos``, this forms a partial mapping between token indices and file positions. :ivar _filepos: A list containing the file position of each block that has been processed. In particular, ``_toknum[i]`` is the file position of the first character in block ``i``. Together with ``_toknum``, this forms a partial mapping between token indices and file positions. :ivar _stream: The stream used to access the underlying corpus file. :ivar _len: The total number of tokens in the corpus, if known; or None, if the number of tokens is not yet known. :ivar _eofpos: The character position of the last character in the file. This is calculated when the corpus view is initialized, and is used to decide when the end of file has been reached. 
:ivar _cache: A cache of the most recently read block. It is encoded as a tuple (start_toknum, end_toknum, tokens), where start_toknum is the token index of the first token in the block; end_toknum is the token index of the first token not in the block; and tokens is a list of the tokens in the block. """ def __init__(self, fileid, block_reader=None, startpos=0, encoding='utf8'): """ Create a new corpus view, based on the file ``fileid``, and read with ``block_reader``. See the class documentation for more information. :param fileid: The path to the file that is read by this corpus view. ``fileid`` can either be a string or a ``PathPointer``. :param startpos: The file position at which the view will start reading. This can be used to skip over preface sections. :param encoding: The unicode encoding that should be used to read the file's contents. If no encoding is specified, then the file's contents will be read as a non-unicode string (i.e., a str). """ if block_reader: self.read_block = block_reader # Initialize our toknum/filepos mapping. self._toknum = [0] self._filepos = [startpos] self._encoding = encoding # We don't know our length (number of tokens) yet. self._len = None self._fileid = fileid self._stream = None self._current_toknum = None """This variable is set to the index of the next token that will be read, immediately before ``self.read_block()`` is called. This is provided for the benefit of the block reader, which under rare circumstances may need to know the current token number.""" self._current_blocknum = None """This variable is set to the index of the next block that will be read, immediately before ``self.read_block()`` is called. This is provided for the benefit of the block reader, which under rare circumstances may need to know the current block number.""" # Find the length of the file. try: if isinstance(self._fileid, PathPointer): self._eofpos = self._fileid.file_size() else: self._eofpos = os.stat(self._fileid).st_size except Exception as exc: raise ValueError('Unable to open or access %r -- %s' % (fileid, exc)) # Maintain a cache of the most recently read block, to # increase efficiency of random access. self._cache = (-1, -1, None) fileid = property(lambda self: self._fileid, doc=""" The fileid of the file that is accessed by this view. :type: str or PathPointer""") def read_block(self, stream): """ Read a block from the input stream. :return: a block of tokens from the input stream :rtype: list(any) :param stream: an input stream :type stream: stream """ raise NotImplementedError('Abstract Method') def _open(self): """ Open the file stream associated with this corpus view. This will be called performed if any value is read from the view while its file stream is closed. """ if isinstance(self._fileid, PathPointer): self._stream = self._fileid.open(self._encoding) elif self._encoding: self._stream = SeekableUnicodeStreamReader( open(self._fileid, 'rb'), self._encoding) else: self._stream = open(self._fileid, 'rb') def close(self): """ Close the file stream associated with this corpus view. This can be useful if you are worried about running out of file handles (although the stream should automatically be closed upon garbage collection of the corpus view). If the corpus view is accessed after it is closed, it will be automatically re-opened. 
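        A minimal sketch (``my_corpus.txt`` is a hypothetical file; it uses the
        ``read_line_block`` block reader defined later in this module):

            >>> view = StreamBackedCorpusView('my_corpus.txt', read_line_block)  # doctest: +SKIP
            >>> first_line = view[0]   # opens the underlying stream            # doctest: +SKIP
            >>> view.close()           # releases the file handle               # doctest: +SKIP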
""" if self._stream is not None: self._stream.close() self._stream = None def __len__(self): if self._len is None: # iterate_from() sets self._len when it reaches the end # of the file: for tok in self.iterate_from(self._toknum[-1]): pass return self._len def __getitem__(self, i): if isinstance(i, slice): start, stop = slice_bounds(self, i) # Check if it's in the cache. offset = self._cache[0] if offset <= start and stop <= self._cache[1]: return self._cache[2][start-offset:stop-offset] # Construct & return the result. return LazySubsequence(self, start, stop) else: # Handle negative indices if i < 0: i += len(self) if i < 0: raise IndexError('index out of range') # Check if it's in the cache. offset = self._cache[0] if offset <= i < self._cache[1]: return self._cache[2][i-offset] # Use iterate_from to extract it. try: return next(self.iterate_from(i)) except StopIteration: raise IndexError('index out of range') # If we wanted to be thread-safe, then this method would need to # do some locking. def iterate_from(self, start_tok): # Start by feeding from the cache, if possible. if self._cache[0] <= start_tok < self._cache[1]: for tok in self._cache[2][start_tok-self._cache[0]:]: yield tok start_tok += 1 # Decide where in the file we should start. If `start` is in # our mapping, then we can jump straight to the correct block; # otherwise, start at the last block we've processed. if start_tok < self._toknum[-1]: block_index = bisect.bisect_right(self._toknum, start_tok)-1 toknum = self._toknum[block_index] filepos = self._filepos[block_index] else: block_index = len(self._toknum)-1 toknum = self._toknum[-1] filepos = self._filepos[-1] # Open the stream, if it's not open already. if self._stream is None: self._open() # Each iteration through this loop, we read a single block # from the stream. while filepos < self._eofpos: # Read the next block. self._stream.seek(filepos) self._current_toknum = toknum self._current_blocknum = block_index tokens = self.read_block(self._stream) assert isinstance(tokens, (tuple, list, AbstractLazySequence)), ( 'block reader %s() should return list or tuple.' % self.read_block.__name__) num_toks = len(tokens) new_filepos = self._stream.tell() assert new_filepos > filepos, ( 'block reader %s() should consume at least 1 byte (filepos=%d)' % (self.read_block.__name__, filepos)) # Update our cache. self._cache = (toknum, toknum+num_toks, list(tokens)) # Update our mapping. assert toknum <= self._toknum[-1] if num_toks > 0: block_index += 1 if toknum == self._toknum[-1]: assert new_filepos > self._filepos[-1] # monotonic! self._filepos.append(new_filepos) self._toknum.append(toknum+num_toks) else: # Check for consistency: assert new_filepos == self._filepos[block_index], ( 'inconsistent block reader (num chars read)') assert toknum+num_toks == self._toknum[block_index], ( 'inconsistent block reader (num tokens returned)') # If we reached the end of the file, then update self._len if new_filepos == self._eofpos: self._len = toknum + num_toks # Generate the tokens in this block (but skip any tokens # before start_tok). Note that between yields, our state # may be modified. for tok in tokens[max(0, start_tok-toknum):]: yield tok # If we're at the end of the file, then we're done. assert new_filepos <= self._eofpos if new_filepos == self._eofpos: break # Update our indices toknum += num_toks filepos = new_filepos # If we reach this point, then we should know our length. 
assert self._len is not None # Use concat for these, so we can use a ConcatenatedCorpusView # when possible. def __add__(self, other): return concat([self, other]) def __radd__(self, other): return concat([other, self]) def __mul__(self, count): return concat([self] * count) def __rmul__(self, count): return concat([self] * count) class ConcatenatedCorpusView(AbstractLazySequence): """ A 'view' of a corpus file that joins together one or more ``StreamBackedCorpusViews``. At most one file handle is left open at any time. """ def __init__(self, corpus_views): self._pieces = corpus_views """A list of the corpus subviews that make up this concatenation.""" self._offsets = [0] """A list of offsets, indicating the index at which each subview begins. In particular:: offsets[i] = sum([len(p) for p in pieces[:i]])""" self._open_piece = None """The most recently accessed corpus subview (or None). Before a new subview is accessed, this subview will be closed.""" def __len__(self): if len(self._offsets) <= len(self._pieces): # Iterate to the end of the corpus. for tok in self.iterate_from(self._offsets[-1]): pass return self._offsets[-1] def close(self): for piece in self._pieces: piece.close() def iterate_from(self, start_tok): piecenum = bisect.bisect_right(self._offsets, start_tok)-1 while piecenum < len(self._pieces): offset = self._offsets[piecenum] piece = self._pieces[piecenum] # If we've got another piece open, close it first. if self._open_piece is not piece: if self._open_piece is not None: self._open_piece.close() self._open_piece = piece # Get everything we can from this piece. for tok in piece.iterate_from(max(0, start_tok-offset)): yield tok # Update the offset table. if piecenum+1 == len(self._offsets): self._offsets.append(self._offsets[-1] + len(piece)) # Move on to the next piece. piecenum += 1 def concat(docs): """ Concatenate together the contents of multiple documents from a single corpus, using an appropriate concatenation function. This utility function is used by corpus readers when the user requests more than one document at a time. """ if len(docs) == 1: return docs[0] if len(docs) == 0: raise ValueError('concat() expects at least one object!') types = set(d.__class__ for d in docs) # If they're all strings, use string concatenation. if all(isinstance(doc, string_types) for doc in docs): return ''.join(docs) # If they're all corpus views, then use ConcatenatedCorpusView. for typ in types: if not issubclass(typ, (StreamBackedCorpusView, ConcatenatedCorpusView)): break else: return ConcatenatedCorpusView(docs) # If they're all lazy sequences, use a lazy concatenation for typ in types: if not issubclass(typ, AbstractLazySequence): break else: return LazyConcatenation(docs) # Otherwise, see what we can do: if len(types) == 1: typ = list(types)[0] if issubclass(typ, list): return reduce((lambda a,b:a+b), docs, []) if issubclass(typ, tuple): return reduce((lambda a,b:a+b), docs, ()) if ElementTree.iselement(typ): xmltree = ElementTree.Element('documents') for doc in docs: xmltree.append(doc) return xmltree # No method found! raise ValueError("Don't know how to concatenate types: %r" % types) ###################################################################### #{ Corpus View for Pickled Sequences ###################################################################### class PickleCorpusView(StreamBackedCorpusView): """ A stream backed corpus view for corpus files that consist of sequences of serialized Python objects (serialized using ``pickle.dump``). 
One use case for this class is to store the result of running feature detection on a corpus to disk. This can be useful when performing feature detection is expensive (so we don't want to repeat it); but the corpus is too large to store in memory. The following example illustrates this technique: >>> from nltk.corpus.reader.util import PickleCorpusView >>> from nltk.util import LazyMap >>> feature_corpus = LazyMap(detect_features, corpus) # doctest: +SKIP >>> PickleCorpusView.write(feature_corpus, some_fileid) # doctest: +SKIP >>> pcv = PickleCorpusView(some_fileid) # doctest: +SKIP """ BLOCK_SIZE = 100 PROTOCOL = -1 def __init__(self, fileid, delete_on_gc=False): """ Create a new corpus view that reads the pickle corpus ``fileid``. :param delete_on_gc: If true, then ``fileid`` will be deleted whenever this object gets garbage-collected. """ self._delete_on_gc = delete_on_gc StreamBackedCorpusView.__init__(self, fileid) def read_block(self, stream): result = [] for i in range(self.BLOCK_SIZE): try: result.append(pickle.load(stream)) except EOFError: break return result def __del__(self): """ If ``delete_on_gc`` was set to true when this ``PickleCorpusView`` was created, then delete the corpus view's fileid. (This method is called whenever a ``PickledCorpusView`` is garbage-collected. """ if getattr(self, '_delete_on_gc'): if os.path.exists(self._fileid): try: os.remove(self._fileid) except (OSError, IOError): pass self.__dict__.clear() # make the garbage collector's job easier @classmethod def write(cls, sequence, output_file): if isinstance(output_file, string_types): output_file = open(output_file, 'wb') for item in sequence: pickle.dump(item, output_file, cls.PROTOCOL) @classmethod def cache_to_tempfile(cls, sequence, delete_on_gc=True): """ Write the given sequence to a temporary file as a pickle corpus; and then return a ``PickleCorpusView`` view for that temporary corpus file. :param delete_on_gc: If true, then the temporary file will be deleted whenever this object gets garbage-collected. """ try: fd, output_file_name = tempfile.mkstemp('.pcv', 'nltk-') output_file = os.fdopen(fd, 'wb') cls.write(sequence, output_file) output_file.close() return PickleCorpusView(output_file_name, delete_on_gc) except (OSError, IOError) as e: raise ValueError('Error while creating temp file: %s' % e) ###################################################################### #{ Block Readers ###################################################################### def read_whitespace_block(stream): toks = [] for i in range(20): # Read 20 lines at a time. toks.extend(stream.readline().split()) return toks def read_wordpunct_block(stream): toks = [] for i in range(20): # Read 20 lines at a time. 
toks.extend(wordpunct_tokenize(stream.readline())) return toks def read_line_block(stream): toks = [] for i in range(20): line = stream.readline() if not line: return toks toks.append(line.rstrip('\n')) return toks def read_blankline_block(stream): s = '' while True: line = stream.readline() # End of file: if not line: if s: return [s] else: return [] # Blank line: elif line and not line.strip(): if s: return [s] # Other line: else: s += line def read_alignedsent_block(stream): s = '' while True: line = stream.readline() if line[0] == '=' or line[0] == '\n' or line[:2] == '\r\n': continue # End of file: if not line: if s: return [s] else: return [] # Other line: else: s += line if re.match('^\d+-\d+', line) is not None: return [s] def read_regexp_block(stream, start_re, end_re=None): """ Read a sequence of tokens from a stream, where tokens begin with lines that match ``start_re``. If ``end_re`` is specified, then tokens end with lines that match ``end_re``; otherwise, tokens end whenever the next line matching ``start_re`` or EOF is found. """ # Scan until we find a line matching the start regexp. while True: line = stream.readline() if not line: return [] # end of file. if re.match(start_re, line): break # Scan until we find another line matching the regexp, or EOF. lines = [line] while True: oldpos = stream.tell() line = stream.readline() # End of file: if not line: return [''.join(lines)] # End of token: if end_re is not None and re.match(end_re, line): return [''.join(lines)] # Start of new token: backup to just before it starts, and # return the token we've already collected. if end_re is None and re.match(start_re, line): stream.seek(oldpos) return [''.join(lines)] # Anything else is part of the token. lines.append(line) def read_sexpr_block(stream, block_size=16384, comment_char=None): """ Read a sequence of s-expressions from the stream, and leave the stream's file position at the end the last complete s-expression read. This function will always return at least one s-expression, unless there are no more s-expressions in the file. If the file ends in in the middle of an s-expression, then that incomplete s-expression is returned when the end of the file is reached. :param block_size: The default block size for reading. If an s-expression is longer than one block, then more than one block will be read. :param comment_char: A character that marks comments. Any lines that begin with this character will be stripped out. (If spaces or tabs precede the comment character, then the line will not be stripped.) """ start = stream.tell() block = stream.read(block_size) encoding = getattr(stream, 'encoding', None) assert encoding is not None or isinstance(block, text_type) if encoding not in (None, 'utf-8'): import warnings warnings.warn('Parsing may fail, depending on the properties ' 'of the %s encoding!' % encoding) # (e.g., the utf-16 encoding does not work because it insists # on adding BOMs to the beginning of encoded strings.) if comment_char: COMMENT = re.compile('(?m)^%s.*$' % re.escape(comment_char)) while True: try: # If we're stripping comments, then make sure our block ends # on a line boundary; and then replace any comments with # space characters. (We can't just strip them out -- that # would make our offset wrong.) if comment_char: block += stream.readline() block = re.sub(COMMENT, _sub_space, block) # Read the block. tokens, offset = _parse_sexpr_block(block) # Skip whitespace offset = re.compile(r'\s*').search(block, offset).end() # Move to the end position. 
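            # (Note: ``offset`` counts characters in the decoded block; when the
            # stream is decoding text, re-encode the consumed prefix to find the
            # corresponding byte position in the underlying file.)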
if encoding is None: stream.seek(start+offset) else: stream.seek(start+len(block[:offset].encode(encoding))) # Return the list of tokens we processed return tokens except ValueError as e: if e.args[0] == 'Block too small': next_block = stream.read(block_size) if next_block: block += next_block continue else: # The file ended mid-sexpr -- return what we got. return [block.strip()] else: raise def _sub_space(m): """Helper function: given a regexp match, return a string of spaces that's the same length as the matched string.""" return ' '*(m.end()-m.start()) def _parse_sexpr_block(block): tokens = [] start = end = 0 while end < len(block): m = re.compile(r'\S').search(block, end) if not m: return tokens, end start = m.start() # Case 1: sexpr is not parenthesized. if m.group() != '(': m2 = re.compile(r'[\s(]').search(block, start) if m2: end = m2.start() else: if tokens: return tokens, end raise ValueError('Block too small') # Case 2: parenthesized sexpr. else: nesting = 0 for m in re.compile(r'[()]').finditer(block, start): if m.group()=='(': nesting += 1 else: nesting -= 1 if nesting == 0: end = m.end() break else: if tokens: return tokens, end raise ValueError('Block too small') tokens.append(block[start:end]) return tokens, end ###################################################################### #{ Finding Corpus Items ###################################################################### def find_corpus_fileids(root, regexp): if not isinstance(root, PathPointer): raise TypeError('find_corpus_fileids: expected a PathPointer') regexp += '$' # Find fileids in a zipfile: scan the zipfile's namelist. Filter # out entries that end in '/' -- they're directories. if isinstance(root, ZipFilePathPointer): fileids = [name[len(root.entry):] for name in root.zipfile.namelist() if not name.endswith('/')] items = [name for name in fileids if re.match(regexp, name)] return sorted(items) # Find fileids in a directory: use os.walk to search all (proper # or symlinked) subdirectories, and match paths against the regexp. elif isinstance(root, FileSystemPathPointer): items = [] # workaround for py25 which doesn't support followlinks kwargs = {} if not py25(): kwargs = {'followlinks': True} for dirname, subdirs, fileids in os.walk(root.path, **kwargs): prefix = ''.join('%s/' % p for p in _path_from(root.path, dirname)) items += [prefix+fileid for fileid in fileids if re.match(regexp, prefix+fileid)] # Don't visit svn directories: if '.svn' in subdirs: subdirs.remove('.svn') return sorted(items) else: raise AssertionError("Don't know how to handle %r" % root) def _path_from(parent, child): if os.path.split(parent)[1] == '': parent = os.path.split(parent)[0] path = [] while parent != child: child, dirname = os.path.split(child) path.insert(0, dirname) assert os.path.split(child)[0] != child return path ###################################################################### #{ Paragraph structure in Treebank files ###################################################################### def tagged_treebank_para_block_reader(stream): # Read the next paragraph. 
para = '' while True: line = stream.readline() # End of paragraph: if re.match('======+\s*$', line): if para.strip(): return [para] # End of file: elif line == '': if para.strip(): return [para] else: return [] # Content line: else: para += line nltk-3.1/nltk/corpus/reader/verbnet.py0000644000076500000240000004033112607224144017623 0ustar sbstaff00000000000000# Natural Language Toolkit: Verbnet Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ An NLTK interface to the VerbNet verb lexicon For details about VerbNet see: http://verbs.colorado.edu/~mpalmer/projects/verbnet.html """ from __future__ import unicode_literals import re import textwrap from collections import defaultdict from nltk import compat from nltk.corpus.reader.xmldocs import XMLCorpusReader class VerbnetCorpusReader(XMLCorpusReader): """ An NLTK interface to the VerbNet verb lexicon. From the VerbNet site: "VerbNet (VN) (Kipper-Schuler 2006) is the largest on-line verb lexicon currently available for English. It is a hierarchical domain-independent, broad-coverage verb lexicon with mappings to other lexical resources such as WordNet (Miller, 1990; Fellbaum, 1998), Xtag (XTAG Research Group, 2001), and FrameNet (Baker et al., 1998)." For details about VerbNet see: http://verbs.colorado.edu/~mpalmer/projects/verbnet.html """ # No unicode encoding param, since the data files are all XML. def __init__(self, root, fileids, wrap_etree=False): XMLCorpusReader.__init__(self, root, fileids, wrap_etree) self._lemma_to_class = defaultdict(list) """A dictionary mapping from verb lemma strings to lists of verbnet class identifiers.""" self._wordnet_to_class = defaultdict(list) """A dictionary mapping from wordnet identifier strings to lists of verbnet class identifiers.""" self._class_to_fileid = {} """A dictionary mapping from class identifiers to corresponding file identifiers. The keys of this dictionary provide a complete list of all classes and subclasses.""" self._shortid_to_longid = {} # Initialize the dictionaries. Use the quick (regexp-based) # method instead of the slow (xml-based) method, because it # runs 2-30 times faster. self._quick_index() _LONGID_RE = re.compile(r'([^\-\.]*)-([\d+.\-]+)$') """Regular expression that matches (and decomposes) longids""" _SHORTID_RE = re.compile(r'[\d+.\-]+$') """Regular expression that matches shortids""" _INDEX_RE = re.compile(r']+>|' r'') """Regular expression used by ``_index()`` to quickly scan the corpus for basic information.""" def lemmas(self, classid=None): """ Return a list of all verb lemmas that appear in any class, or in the ``classid`` if specified. """ if classid is None: return sorted(self._lemma_to_class.keys()) else: # [xx] should this include subclass members? vnclass = self.vnclass(classid) return [member.get('name') for member in vnclass.findall('MEMBERS/MEMBER')] def wordnetids(self, classid=None): """ Return a list of all wordnet identifiers that appear in any class, or in ``classid`` if specified. """ if classid is None: return sorted(self._wordnet_to_class.keys()) else: # [xx] should this include subclass members? vnclass = self.vnclass(classid) return sum([member.get('wn','').split() for member in vnclass.findall('MEMBERS/MEMBER')], []) def classids(self, lemma=None, wordnetid=None, fileid=None, classid=None): """ Return a list of the verbnet class identifiers. 
If a file identifier is specified, then return only the verbnet class identifiers for classes (and subclasses) defined by that file. If a lemma is specified, then return only verbnet class identifiers for classes that contain that lemma as a member. If a wordnetid is specified, then return only identifiers for classes that contain that wordnetid as a member. If a classid is specified, then return only identifiers for subclasses of the specified verbnet class. """ if len([x for x in [lemma, wordnetid, fileid, classid] if x is not None]) > 1: raise ValueError('Specify at most one of: fileid, wordnetid, ' 'fileid, classid') if fileid is not None: return [c for (c,f) in self._class_to_fileid.items() if f == fileid] elif lemma is not None: return self._lemma_to_class[lemma] elif wordnetid is not None: return self._wordnet_to_class[wordnetid] elif classid is not None: xmltree = self.vnclass(classid) return [subclass.get('ID') for subclass in xmltree.findall('SUBCLASSES/VNSUBCLASS')] else: return sorted(self._class_to_fileid.keys()) def vnclass(self, fileid_or_classid): """ Return an ElementTree containing the xml for the specified verbnet class. :param fileid_or_classid: An identifier specifying which class should be returned. Can be a file identifier (such as ``'put-9.1.xml'``), or a verbnet class identifier (such as ``'put-9.1'``) or a short verbnet class identifier (such as ``'9.1'``). """ # File identifier: just return the xml. if fileid_or_classid in self._fileids: return self.xml(fileid_or_classid) # Class identifier: get the xml, and find the right elt. classid = self.longid(fileid_or_classid) if classid in self._class_to_fileid: fileid = self._class_to_fileid[self.longid(classid)] tree = self.xml(fileid) if classid == tree.get('ID'): return tree else: for subclass in tree.findall('.//VNSUBCLASS'): if classid == subclass.get('ID'): return subclass else: assert False # we saw it during _index()! else: raise ValueError('Unknown identifier %s' % fileid_or_classid) def fileids(self, vnclass_ids=None): """ Return a list of fileids that make up this corpus. If ``vnclass_ids`` is specified, then return the fileids that make up the specified verbnet class(es). """ if vnclass_ids is None: return self._fileids elif isinstance(vnclass_ids, compat.string_types): return [self._class_to_fileid[self.longid(vnclass_ids)]] else: return [self._class_to_fileid[self.longid(vnclass_id)] for vnclass_id in vnclass_ids] ###################################################################### #{ Index Initialization ###################################################################### def _index(self): """ Initialize the indexes ``_lemma_to_class``, ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning through the corpus fileids. This is fast with cElementTree (<0.1 secs), but quite slow (>10 secs) with the python implementation of ElementTree. 
""" for fileid in self._fileids: self._index_helper(self.xml(fileid), fileid) def _index_helper(self, xmltree, fileid): """Helper for ``_index()``""" vnclass = xmltree.get('ID') self._class_to_fileid[vnclass] = fileid self._shortid_to_longid[self.shortid(vnclass)] = vnclass for member in xmltree.findall('MEMBERS/MEMBER'): self._lemma_to_class[member.get('name')].append(vnclass) for wn in member.get('wn', '').split(): self._wordnet_to_class[wn].append(vnclass) for subclass in xmltree.findall('SUBCLASSES/VNSUBCLASS'): self._index_helper(subclass, fileid) def _quick_index(self): """ Initialize the indexes ``_lemma_to_class``, ``_wordnet_to_class``, and ``_class_to_fileid`` by scanning through the corpus fileids. This doesn't do proper xml parsing, but is good enough to find everything in the standard verbnet corpus -- and it runs about 30 times faster than xml parsing (with the python ElementTree; only 2-3 times faster with cElementTree). """ # nb: if we got rid of wordnet_to_class, this would run 2-3 # times faster. for fileid in self._fileids: vnclass = fileid[:-4] # strip the '.xml' self._class_to_fileid[vnclass] = fileid self._shortid_to_longid[self.shortid(vnclass)] = vnclass for m in self._INDEX_RE.finditer(self.open(fileid).read()): groups = m.groups() if groups[0] is not None: self._lemma_to_class[groups[0]].append(vnclass) for wn in groups[1].split(): self._wordnet_to_class[wn].append(vnclass) elif groups[2] is not None: self._class_to_fileid[groups[2]] = fileid vnclass = groups[2] # for elts. self._shortid_to_longid[self.shortid(vnclass)] = vnclass else: assert False, 'unexpected match condition' ###################################################################### #{ Identifier conversion ###################################################################### def longid(self, shortid): """Given a short verbnet class identifier (eg '37.10'), map it to a long id (eg 'confess-37.10'). If ``shortid`` is already a long id, then return it as-is""" if self._LONGID_RE.match(shortid): return shortid # it's already a longid. elif not self._SHORTID_RE.match(shortid): raise ValueError('vnclass identifier %r not found' % shortid) try: return self._shortid_to_longid[shortid] except KeyError: raise ValueError('vnclass identifier %r not found' % shortid) def shortid(self, longid): """Given a long verbnet class identifier (eg 'confess-37.10'), map it to a short id (eg '37.10'). If ``longid`` is already a short id, then return it as-is.""" if self._SHORTID_RE.match(longid): return longid # it's already a shortid. m = self._LONGID_RE.match(longid) if m: return m.group(2) else: raise ValueError('vnclass identifier %r not found' % longid) ###################################################################### #{ Pretty Printing ###################################################################### def pprint(self, vnclass): """ Return a string containing a pretty-printed representation of the given verbnet class. :param vnclass: A verbnet class identifier; or an ElementTree containing the xml contents of a verbnet class. 
""" if isinstance(vnclass, compat.string_types): vnclass = self.vnclass(vnclass) s = vnclass.get('ID') + '\n' s += self.pprint_subclasses(vnclass, indent=' ') + '\n' s += self.pprint_members(vnclass, indent=' ') + '\n' s += ' Thematic roles:\n' s += self.pprint_themroles(vnclass, indent=' ') + '\n' s += ' Frames:\n' s += '\n'.join(self.pprint_frame(vnframe, indent=' ') for vnframe in vnclass.findall('FRAMES/FRAME')) return s def pprint_subclasses(self, vnclass, indent=''): """ Return a string containing a pretty-printed representation of the given verbnet class's subclasses. :param vnclass: A verbnet class identifier; or an ElementTree containing the xml contents of a verbnet class. """ if isinstance(vnclass, compat.string_types): vnclass = self.vnclass(vnclass) subclasses = [subclass.get('ID') for subclass in vnclass.findall('SUBCLASSES/VNSUBCLASS')] if not subclasses: subclasses = ['(none)'] s = 'Subclasses: ' + ' '.join(subclasses) return textwrap.fill(s, 70, initial_indent=indent, subsequent_indent=indent+' ') def pprint_members(self, vnclass, indent=''): """ Return a string containing a pretty-printed representation of the given verbnet class's member verbs. :param vnclass: A verbnet class identifier; or an ElementTree containing the xml contents of a verbnet class. """ if isinstance(vnclass, compat.string_types): vnclass = self.vnclass(vnclass) members = [member.get('name') for member in vnclass.findall('MEMBERS/MEMBER')] if not members: members = ['(none)'] s = 'Members: ' + ' '.join(members) return textwrap.fill(s, 70, initial_indent=indent, subsequent_indent=indent+' ') def pprint_themroles(self, vnclass, indent=''): """ Return a string containing a pretty-printed representation of the given verbnet class's thematic roles. :param vnclass: A verbnet class identifier; or an ElementTree containing the xml contents of a verbnet class. """ if isinstance(vnclass, compat.string_types): vnclass = self.vnclass(vnclass) pieces = [] for themrole in vnclass.findall('THEMROLES/THEMROLE'): piece = indent + '* ' + themrole.get('type') modifiers = ['%(Value)s%(type)s' % restr.attrib for restr in themrole.findall('SELRESTRS/SELRESTR')] if modifiers: piece += '[%s]' % ' '.join(modifiers) pieces.append(piece) return '\n'.join(pieces) def pprint_frame(self, vnframe, indent=''): """ Return a string containing a pretty-printed representation of the given verbnet frame. :param vnframe: An ElementTree containing the xml contents of a verbnet frame. """ s = self.pprint_description(vnframe, indent) + '\n' s += self.pprint_syntax(vnframe, indent+' Syntax: ') + '\n' s += indent + ' Semantics:\n' s += self.pprint_semantics(vnframe, indent+' ') return s def pprint_description(self, vnframe, indent=''): """ Return a string containing a pretty-printed representation of the given verbnet frame description. :param vnframe: An ElementTree containing the xml contents of a verbnet frame. """ descr = vnframe.find('DESCRIPTION') s = indent + descr.attrib['primary'] if descr.get('secondary', ''): s += ' (%s)' % descr.get('secondary') return s def pprint_syntax(self, vnframe, indent=''): """ Return a string containing a pretty-printed representation of the given verbnet frame syntax. :param vnframe: An ElementTree containing the xml contents of a verbnet frame. 
""" pieces = [] for elt in vnframe.find('SYNTAX'): piece = elt.tag modifiers = [] if 'value' in elt.attrib: modifiers.append(elt.get('value')) modifiers += ['%(Value)s%(type)s' % restr.attrib for restr in (elt.findall('SELRESTRS/SELRESTR') + elt.findall('SYNRESTRS/SYNRESTR'))] if modifiers: piece += '[%s]' % ' '.join(modifiers) pieces.append(piece) return indent + ' '.join(pieces) def pprint_semantics(self, vnframe, indent=''): """ Return a string containing a pretty-printed representation of the given verbnet frame semantics. :param vnframe: An ElementTree containing the xml contents of a verbnet frame. """ pieces = [] for pred in vnframe.findall('SEMANTICS/PRED'): args = [arg.get('value') for arg in pred.findall('ARGS/ARG')] pieces.append('%s(%s)' % (pred.get('value'), ', '.join(args))) return '\n'.join('%s* %s' % (indent, piece) for piece in pieces) nltk-3.1/nltk/corpus/reader/wordlist.py0000644000076500000240000000216512607224144020030 0ustar sbstaff00000000000000# Natural Language Toolkit: Word List Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT from nltk import compat from nltk.tokenize import line_tokenize from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class WordListCorpusReader(CorpusReader): """ List of words, one per line. Blank lines are ignored. """ def words(self, fileids=None): return line_tokenize(self.raw(fileids)) def raw(self, fileids=None): if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) class SwadeshCorpusReader(WordListCorpusReader): def entries(self, fileids=None): """ :return: a tuple of words for the specified fileids. """ if not fileids: fileids = self.fileids() wordlists = [self.words(f) for f in fileids] return list(zip(*wordlists)) nltk-3.1/nltk/corpus/reader/wordnet.py0000644000076500000240000022560512607247337017662 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: WordNet # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bethard # Steven Bird # Edward Loper # Nitin Madnani # Nasruddin A’aidil Shari # Sim Wei Ying Geraldine # Soe Lynn # Francis Bond # URL: # For license information, see LICENSE.TXT """ An NLTK interface for WordNet WordNet is a lexical database of English. Using synsets, helps find conceptual relationships between words such as hypernyms, hyponyms, synonyms, antonyms etc. 
For details about WordNet see: http://wordnet.princeton.edu/ This module also allows you to find lemmas in languages other than English from the Open Multilingual Wordnet http://compling.hss.ntu.edu.sg/omw/ """ from __future__ import print_function, unicode_literals import math import re from itertools import islice, chain from operator import itemgetter, attrgetter from collections import defaultdict, deque from nltk.corpus.reader import CorpusReader from nltk.util import binary_search_file as _binary_search_file from nltk.probability import FreqDist from nltk.compat import (iteritems, python_2_unicode_compatible, total_ordering, xrange) ###################################################################### ## Table of Contents ###################################################################### ## - Constants ## - Data Classes ## - WordNetError ## - Lemma ## - Synset ## - WordNet Corpus Reader ## - WordNet Information Content Corpus Reader ## - Similarity Metrics ## - Demo ###################################################################### ## Constants ###################################################################### #: Positive infinity (for similarity functions) _INF = 1e300 #{ Part-of-speech constants ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v' #} POS_LIST = [NOUN, VERB, ADJ, ADV] #: A table of strings that are used to express verb frames. VERB_FRAME_STRINGS = ( None, "Something %s", "Somebody %s", "It is %sing", "Something is %sing PP", "Something %s something Adjective/Noun", "Something %s Adjective/Noun", "Somebody %s Adjective", "Somebody %s something", "Somebody %s somebody", "Something %s somebody", "Something %s something", "Something %s to somebody", "Somebody %s on something", "Somebody %s somebody something", "Somebody %s something to somebody", "Somebody %s something from somebody", "Somebody %s somebody with something", "Somebody %s somebody of something", "Somebody %s something on somebody", "Somebody %s somebody PP", "Somebody %s something PP", "Somebody %s PP", "Somebody's (body part) %s", "Somebody %s somebody to INFINITIVE", "Somebody %s somebody INFINITIVE", "Somebody %s that CLAUSE", "Somebody %s to somebody", "Somebody %s to INFINITIVE", "Somebody %s whether INFINITIVE", "Somebody %s somebody into V-ing something", "Somebody %s something with something", "Somebody %s INFINITIVE", "Somebody %s VERB-ing", "It %s that CLAUSE", "Something %s INFINITIVE") SENSENUM_RE = re.compile(r'\.\d\d\.') ###################################################################### ## Data Classes ###################################################################### class WordNetError(Exception): """An exception class for wordnet-related errors.""" @total_ordering class _WordNetObject(object): """A common base class for lemmas and synsets.""" def hypernyms(self): return self._related('@') def _hypernyms(self): return self._related('@', sort=False) def instance_hypernyms(self): return self._related('@i') def _instance_hypernyms(self): return self._related('@i', sort=False) def hyponyms(self): return self._related('~') def instance_hyponyms(self): return self._related('~i') def member_holonyms(self): return self._related('#m') def substance_holonyms(self): return self._related('#s') def part_holonyms(self): return self._related('#p') def member_meronyms(self): return self._related('%m') def substance_meronyms(self): return self._related('%s') def part_meronyms(self): return self._related('%p') def topic_domains(self): return self._related(';c') def 
region_domains(self): return self._related(';r') def usage_domains(self): return self._related(';u') def attributes(self): return self._related('=') def entailments(self): return self._related('*') def causes(self): return self._related('>') def also_sees(self): return self._related('^') def verb_groups(self): return self._related('$') def similar_tos(self): return self._related('&') def __hash__(self): return hash(self._name) def __eq__(self, other): return self._name == other._name def __ne__(self, other): return self._name != other._name def __lt__(self, other): return self._name < other._name @python_2_unicode_compatible class Lemma(_WordNetObject): """ The lexical entry for a single morphological form of a sense-disambiguated word. Create a Lemma from a "..." string where: is the morphological stem identifying the synset is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB is the sense number, counting from 0. is the morphological form of interest Note that and can be different, e.g. the Synset 'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and 'salt.n.03.salinity'. Lemma attributes, accessible via methods with the same name:: - name: The canonical name of this lemma. - synset: The synset that this lemma belongs to. - syntactic_marker: For adjectives, the WordNet string identifying the syntactic position relative modified noun. See: http://wordnet.princeton.edu/man/wninput.5WN.html#sect10 For all other parts of speech, this attribute is None. - count: The frequency of this lemma in wordnet. Lemma methods: Lemmas have the following methods for retrieving related Lemmas. They correspond to the names for the pointer symbols defined here: http://wordnet.princeton.edu/man/wninput.5WN.html#sect3 These methods all return lists of Lemmas: - antonyms - hypernyms, instance_hypernyms - hyponyms, instance_hyponyms - member_holonyms, substance_holonyms, part_holonyms - member_meronyms, substance_meronyms, part_meronyms - topic_domains, region_domains, usage_domains - attributes - derivationally_related_forms - entailments - causes - also_sees - verb_groups - similar_tos - pertainyms """ __slots__ = ['_wordnet_corpus_reader', '_name', '_syntactic_marker', '_synset', '_frame_strings', '_frame_ids', '_lexname_index', '_lex_id', '_lang', '_key'] def __init__(self, wordnet_corpus_reader, synset, name, lexname_index, lex_id, syntactic_marker): self._wordnet_corpus_reader = wordnet_corpus_reader self._name = name self._syntactic_marker = syntactic_marker self._synset = synset self._frame_strings = [] self._frame_ids = [] self._lexname_index = lexname_index self._lex_id = lex_id self._lang = 'eng' self._key = None # gets set later. 
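    # Illustrative usage (assumes the ``wordnet`` corpus is installed).
    # Lemmas are normally obtained from the corpus reader rather than
    # constructed directly, e.g.:
    #     >>> from nltk.corpus import wordnet as wn
    #     >>> wn.lemma('salt.n.03.salinity').synset()
    #     Synset('salt.n.03')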
def name(self): return self._name def syntactic_marker(self): return self._syntactic_marker def synset(self): return self._synset def frame_strings(self): return self._frame_strings def frame_ids(self): return self._frame_ids def lang(self): return self._lang def key(self): return self._key def __repr__(self): tup = type(self).__name__, self._synset._name, self._name return "%s('%s.%s')" % tup def _related(self, relation_symbol): get_synset = self._wordnet_corpus_reader._synset_from_pos_and_offset return sorted([get_synset(pos, offset)._lemmas[lemma_index] for pos, offset, lemma_index in self._synset._lemma_pointers[self._name, relation_symbol]]) def count(self): """Return the frequency count for this Lemma""" return self._wordnet_corpus_reader.lemma_count(self) def antonyms(self): return self._related('!') def derivationally_related_forms(self): return self._related('+') def pertainyms(self): return self._related('\\') @python_2_unicode_compatible class Synset(_WordNetObject): """Create a Synset from a ".." string where: is the word's morphological stem is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB is the sense number, counting from 0. Synset attributes, accessible via methods with the same name: - name: The canonical name of this synset, formed using the first lemma of this synset. Note that this may be different from the name passed to the constructor if that string used a different lemma to identify the synset. - pos: The synset's part of speech, matching one of the module level attributes ADJ, ADJ_SAT, ADV, NOUN or VERB. - lemmas: A list of the Lemma objects for this synset. - definition: The definition for this synset. - examples: A list of example strings for this synset. - offset: The offset in the WordNet dict file of this synset. - lexname: The name of the lexicographer file containing this synset. Synset methods: Synsets have the following methods for retrieving related Synsets. They correspond to the names for the pointer symbols defined here: http://wordnet.princeton.edu/man/wninput.5WN.html#sect3 These methods all return lists of Synsets. 
- hypernyms, instance_hypernyms - hyponyms, instance_hyponyms - member_holonyms, substance_holonyms, part_holonyms - member_meronyms, substance_meronyms, part_meronyms - attributes - entailments - causes - also_sees - verb_groups - similar_tos Additionally, Synsets support the following methods specific to the hypernym relation: - root_hypernyms - common_hypernyms - lowest_common_hypernyms Note that Synsets do not support the following relations because these are defined by WordNet as lexical relations: - antonyms - derivationally_related_forms - pertainyms """ __slots__ = ['_pos', '_offset', '_name', '_frame_ids', '_lemmas', '_lemma_names', '_definition', '_examples', '_lexname', '_pointers', '_lemma_pointers', '_max_depth', '_min_depth'] def __init__(self, wordnet_corpus_reader): self._wordnet_corpus_reader = wordnet_corpus_reader # All of these attributes get initialized by # WordNetCorpusReader._synset_from_pos_and_line() self._pos = None self._offset = None self._name = None self._frame_ids = [] self._lemmas = [] self._lemma_names = [] self._definition = None self._examples = [] self._lexname = None # lexicographer name self._all_hypernyms = None self._pointers = defaultdict(set) self._lemma_pointers = defaultdict(set) def pos(self): return self._pos def offset(self): return self._offset def name(self): return self._name def frame_ids(self): return self._frame_ids def definition(self): return self._definition def examples(self): return self._examples def lexname(self): return self._lexname def _needs_root(self): if self._pos == NOUN: if self._wordnet_corpus_reader.get_version() == '1.6': return True else: return False elif self._pos == VERB: return True def lemma_names(self, lang='eng'): '''Return all the lemma_names associated with the synset''' if lang=='eng': return self._lemma_names else: self._wordnet_corpus_reader._load_lang_data(lang) i = self._wordnet_corpus_reader.ss2of(self) if i in self._wordnet_corpus_reader._lang_data[lang][0]: return self._wordnet_corpus_reader._lang_data[lang][0][i] else: return [] def lemmas(self, lang='eng'): '''Return all the lemma objects associated with the synset''' if lang=='eng': return self._lemmas else: self._wordnet_corpus_reader._load_lang_data(lang) lemmark = [] lemmy = self.lemma_names(lang) for lem in lemmy: temp= Lemma(self._wordnet_corpus_reader, self, lem, self._wordnet_corpus_reader._lexnames.index(self.lexname()), 0, None) temp._lang=lang lemmark.append(temp) return lemmark def root_hypernyms(self): """Get the topmost hypernyms of this synset in WordNet.""" result = [] seen = set() todo = [self] while todo: next_synset = todo.pop() if next_synset not in seen: seen.add(next_synset) next_hypernyms = next_synset.hypernyms() + \ next_synset.instance_hypernyms() if not next_hypernyms: result.append(next_synset) else: todo.extend(next_hypernyms) return result # Simpler implementation which makes incorrect assumption that # hypernym hierarchy is acyclic: # # if not self.hypernyms(): # return [self] # else: # return list(set(root for h in self.hypernyms() # for root in h.root_hypernyms())) def max_depth(self): """ :return: The length of the longest hypernym path from this synset to the root. """ if "_max_depth" not in self.__dict__: hypernyms = self.hypernyms() + self.instance_hypernyms() if not hypernyms: self._max_depth = 0 else: self._max_depth = 1 + max(h.max_depth() for h in hypernyms) return self._max_depth def min_depth(self): """ :return: The length of the shortest hypernym path from this synset to the root. 
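        An illustrative example (assumes the ``wordnet`` corpus is installed;
        the exact value depends on the WordNet version):

            >>> from nltk.corpus import wordnet as wn  # doctest: +SKIP
            >>> wn.synset('dog.n.01').min_depth()      # doctest: +SKIP
            8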
""" if "_min_depth" not in self.__dict__: hypernyms = self.hypernyms() + self.instance_hypernyms() if not hypernyms: self._min_depth = 0 else: self._min_depth = 1 + min(h.min_depth() for h in hypernyms) return self._min_depth def closure(self, rel, depth=-1): """Return the transitive closure of source under the rel relationship, breadth-first >>> from nltk.corpus import wordnet as wn >>> dog = wn.synset('dog.n.01') >>> hyp = lambda s:s.hypernyms() >>> list(dog.closure(hyp)) [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'), Synset('animal.n.01'), Synset('placental.n.01'), Synset('organism.n.01'), Synset('mammal.n.01'), Synset('living_thing.n.01'), Synset('vertebrate.n.01'), Synset('whole.n.02'), Synset('chordate.n.01'), Synset('object.n.01'), Synset('physical_entity.n.01'), Synset('entity.n.01')] """ from nltk.util import breadth_first synset_offsets = [] for synset in breadth_first(self, rel, depth): if synset._offset != self._offset: if synset._offset not in synset_offsets: synset_offsets.append(synset._offset) yield synset def hypernym_paths(self): """ Get the path(s) from this synset to the root, where each path is a list of the synset nodes traversed on the way to the root. :return: A list of lists, where each list gives the node sequence connecting the initial ``Synset`` node and a root node. """ paths = [] hypernyms = self.hypernyms() + self.instance_hypernyms() if len(hypernyms) == 0: paths = [[self]] for hypernym in hypernyms: for ancestor_list in hypernym.hypernym_paths(): ancestor_list.append(self) paths.append(ancestor_list) return paths def common_hypernyms(self, other): """ Find all synsets that are hypernyms of this synset and the other synset. :type other: Synset :param other: other input synset. :return: The synsets that are hypernyms of both synsets. """ if not self._all_hypernyms: self._all_hypernyms = set(self_synset for self_synsets in self._iter_hypernym_lists() for self_synset in self_synsets) if not other._all_hypernyms: other._all_hypernyms = set(other_synset for other_synsets in other._iter_hypernym_lists() for other_synset in other_synsets) return list(self._all_hypernyms.intersection(other._all_hypernyms)) def lowest_common_hypernyms(self, other, simulate_root=False, use_min_depth=False): """ Get a list of lowest synset(s) that both synsets have as a hypernym. When `use_min_depth == False` this means that the synset which appears as a hypernym of both `self` and `other` with the lowest maximum depth is returned or if there are multiple such synsets at the same depth they are all returned However, if `use_min_depth == True` then the synset(s) which has/have the lowest minimum depth and appear(s) in both paths is/are returned. By setting the use_min_depth flag to True, the behavior of NLTK2 can be preserved. This was changed in NLTK3 to give more accurate results in a small set of cases, generally with synsets concerning people. (eg: 'chef.n.01', 'fireman.n.01', etc.) This method is an implementation of Ted Pedersen's "Lowest Common Subsumer" method from the Perl Wordnet module. It can return either "self" or "other" if they are a hypernym of the other. :type other: Synset :param other: other input synset :type simulate_root: bool :param simulate_root: The various verb taxonomies do not share a single root which disallows this metric from working for synsets that are not connected. This flag (False by default) creates a fake root that connects all the taxonomies. Set it to True to enable this behavior. 
For the noun taxonomy, there is usually a default root except for WordNet version 1.6. If you are using wordnet 1.6, a fake root will need to be added for nouns as well. :type use_min_depth: bool :param use_min_depth: This setting mimics older (v2) behavior of NLTK wordnet If True, will use the min_depth function to calculate the lowest common hypernyms. This is known to give strange results for some synset pairs (eg: 'chef.n.01', 'fireman.n.01') but is retained for backwards compatibility :return: The synsets that are the lowest common hypernyms of both synsets """ synsets = self.common_hypernyms(other) if simulate_root: fake_synset = Synset(None) fake_synset._name = '*ROOT*' fake_synset.hypernyms = lambda: [] fake_synset.instance_hypernyms = lambda: [] synsets.append(fake_synset) try: if use_min_depth: max_depth = max(s.min_depth() for s in synsets) unsorted_lch = [s for s in synsets if s.min_depth() == max_depth] else: max_depth = max(s.max_depth() for s in synsets) unsorted_lch = [s for s in synsets if s.max_depth() == max_depth] return sorted(unsorted_lch) except ValueError: return [] def hypernym_distances(self, distance=0, simulate_root=False): """ Get the path(s) from this synset to the root, counting the distance of each node from the initial node on the way. A set of (synset, distance) tuples is returned. :type distance: int :param distance: the distance (number of edges) from this hypernym to the original hypernym ``Synset`` on which this method was called. :return: A set of ``(Synset, int)`` tuples where each ``Synset`` is a hypernym of the first ``Synset``. """ distances = set([(self, distance)]) for hypernym in self._hypernyms() + self._instance_hypernyms(): distances |= hypernym.hypernym_distances(distance+1, simulate_root=False) if simulate_root: fake_synset = Synset(None) fake_synset._name = '*ROOT*' fake_synset_distance = max(distances, key=itemgetter(1))[1] distances.add((fake_synset, fake_synset_distance+1)) return distances def _shortest_hypernym_paths(self, simulate_root): if self._name == '*ROOT*': return {self: 0} queue = deque([(self, 0)]) path = {} while queue: s, depth = queue.popleft() if s in path: continue path[s] = depth depth += 1 queue.extend((hyp, depth) for hyp in s._hypernyms()) queue.extend((hyp, depth) for hyp in s._instance_hypernyms()) if simulate_root: fake_synset = Synset(None) fake_synset._name = '*ROOT*' path[fake_synset] = max(path.values()) + 1 return path def shortest_path_distance(self, other, simulate_root=False): """ Returns the distance of the shortest path linking the two synsets (if one exists). For each synset, all the ancestor nodes and their distances are recorded and compared. The ancestor node common to both synsets that can be reached with the minimum number of traversals is used. If no ancestor nodes are common, None is returned. If a node is compared with itself 0 is returned. :type other: Synset :param other: The Synset to which the shortest path will be found. :return: The number of edges in the shortest path connecting the two nodes, or None if no path exists. """ if self == other: return 0 dist_dict1 = self._shortest_hypernym_paths(simulate_root) dist_dict2 = other._shortest_hypernym_paths(simulate_root) # For each ancestor synset common to both subject synsets, find the # connecting path length. Return the shortest of these. 
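        # (dist_dict1 and dist_dict2 map each ancestor synset to its distance
        # from self and other respectively, so the shortest connecting path
        # through a common ancestor has length d1 + d2.)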
inf = float('inf') path_distance = inf for synset, d1 in iteritems(dist_dict1): d2 = dist_dict2.get(synset, inf) path_distance = min(path_distance, d1 + d2) return None if math.isinf(path_distance) else path_distance def tree(self, rel, depth=-1, cut_mark=None): """ >>> from nltk.corpus import wordnet as wn >>> dog = wn.synset('dog.n.01') >>> hyp = lambda s:s.hypernyms() >>> from pprint import pprint >>> pprint(dog.tree(hyp)) [Synset('dog.n.01'), [Synset('canine.n.02'), [Synset('carnivore.n.01'), [Synset('placental.n.01'), [Synset('mammal.n.01'), [Synset('vertebrate.n.01'), [Synset('chordate.n.01'), [Synset('animal.n.01'), [Synset('organism.n.01'), [Synset('living_thing.n.01'), [Synset('whole.n.02'), [Synset('object.n.01'), [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]]]]]], [Synset('domestic_animal.n.01'), [Synset('animal.n.01'), [Synset('organism.n.01'), [Synset('living_thing.n.01'), [Synset('whole.n.02'), [Synset('object.n.01'), [Synset('physical_entity.n.01'), [Synset('entity.n.01')]]]]]]]]] """ tree = [self] if depth != 0: tree += [x.tree(rel, depth-1, cut_mark) for x in rel(self)] elif cut_mark: tree += [cut_mark] return tree # interface to similarity methods def path_similarity(self, other, verbose=False, simulate_root=True): """ Path Distance Similarity: Return a score denoting how similar two word senses are, based on the shortest path that connects the senses in the is-a (hypernym/hypnoym) taxonomy. The score is in the range 0 to 1, except in those cases where a path cannot be found (will only be true for verbs as there are many distinct verb taxonomies), in which case None is returned. A score of 1 represents identity i.e. comparing a sense with itself will return 1. :type other: Synset :param other: The ``Synset`` that this ``Synset`` is being compared to. :type simulate_root: bool :param simulate_root: The various verb taxonomies do not share a single root which disallows this metric from working for synsets that are not connected. This flag (True by default) creates a fake root that connects all the taxonomies. Set it to false to disable this behavior. For the noun taxonomy, there is usually a default root except for WordNet version 1.6. If you are using wordnet 1.6, a fake root will be added for nouns as well. :return: A score denoting the similarity of the two ``Synset`` objects, normally between 0 and 1. None is returned if no connecting path could be found. 1 is returned if a ``Synset`` is compared with itself. """ distance = self.shortest_path_distance(other, simulate_root=simulate_root and self._needs_root()) if distance is None or distance < 0: return None return 1.0 / (distance + 1) def lch_similarity(self, other, verbose=False, simulate_root=True): """ Leacock Chodorow Similarity: Return a score denoting how similar two word senses are, based on the shortest path that connects the senses (as above) and the maximum depth of the taxonomy in which the senses occur. The relationship is given as -log(p/2d) where p is the shortest path length and d is the taxonomy depth. :type other: Synset :param other: The ``Synset`` that this ``Synset`` is being compared to. :type simulate_root: bool :param simulate_root: The various verb taxonomies do not share a single root which disallows this metric from working for synsets that are not connected. This flag (True by default) creates a fake root that connects all the taxonomies. Set it to false to disable this behavior. For the noun taxonomy, there is usually a default root except for WordNet version 1.6. 
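# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original module): path-based
# similarity, built on shortest_path_distance() above.  Assumes
# nltk.download('wordnet'); numbers are for WordNet 3.0.
from nltk.corpus import wordnet as wn

dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
print(dog.shortest_path_distance(cat))  # 4 (dog - canine - carnivore - feline - cat)
print(dog.path_similarity(cat))         # 0.2 == 1.0 / (4 + 1)
print(dog.path_similarity(dog))         # 1.0 -- identity
# ----------------------------------------------------------------------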
If you are using wordnet 1.6, a fake root will be added for nouns as well. :return: A score denoting the similarity of the two ``Synset`` objects, normally greater than 0. None is returned if no connecting path could be found. If a ``Synset`` is compared with itself, the maximum score is returned, which varies depending on the taxonomy depth. """ if self._pos != other._pos: raise WordNetError('Computing the lch similarity requires ' + \ '%s and %s to have the same part of speech.' % \ (self, other)) need_root = self._needs_root() if self._pos not in self._wordnet_corpus_reader._max_depth: self._wordnet_corpus_reader._compute_max_depth(self._pos, need_root) depth = self._wordnet_corpus_reader._max_depth[self._pos] distance = self.shortest_path_distance(other, simulate_root=simulate_root and need_root) if distance is None or distance < 0 or depth == 0: return None return -math.log((distance + 1) / (2.0 * depth)) def wup_similarity(self, other, verbose=False, simulate_root=True): """ Wu-Palmer Similarity: Return a score denoting how similar two word senses are, based on the depth of the two senses in the taxonomy and that of their Least Common Subsumer (most specific ancestor node). Previously, the scores computed by this implementation did _not_ always agree with those given by Pedersen's Perl implementation of WordNet Similarity. However, with the addition of the simulate_root flag (see below), the score for verbs now almost always agree but not always for nouns. The LCS does not necessarily feature in the shortest path connecting the two senses, as it is by definition the common ancestor deepest in the taxonomy, not closest to the two senses. Typically, however, it will so feature. Where multiple candidates for the LCS exist, that whose shortest path to the root node is the longest will be selected. Where the LCS has multiple paths to the root, the longer path is used for the purposes of the calculation. :type other: Synset :param other: The ``Synset`` that this ``Synset`` is being compared to. :type simulate_root: bool :param simulate_root: The various verb taxonomies do not share a single root which disallows this metric from working for synsets that are not connected. This flag (True by default) creates a fake root that connects all the taxonomies. Set it to false to disable this behavior. For the noun taxonomy, there is usually a default root except for WordNet version 1.6. If you are using wordnet 1.6, a fake root will be added for nouns as well. :return: A float score denoting the similarity of the two ``Synset`` objects, normally greater than zero. If no connecting path between the two senses can be found, None is returned. 
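# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original module): Leacock-Chodorow
# similarity is -log(p / 2d), where p is the shortest path length counted
# in nodes (edge distance + 1) and d is the maximum taxonomy depth.
# Assumes nltk.download('wordnet'); with the bundled WordNet 3.0 data the
# noun taxonomy depth d is 19.
import math
from nltk.corpus import wordnet as wn

dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
print(dog.lch_similarity(cat))          # approximately 2.028
print(-math.log((4 + 1) / (2.0 * 19)))  # the same value computed by hand
# ----------------------------------------------------------------------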
""" need_root = self._needs_root() # Note that to preserve behavior from NLTK2 we set use_min_depth=True # It is possible that more accurate results could be obtained by # removing this setting and it should be tested later on subsumers = self.lowest_common_hypernyms(other, simulate_root=simulate_root and need_root, use_min_depth=True) # If no LCS was found return None if len(subsumers) == 0: return None subsumer = subsumers[0] # Get the longest path from the LCS to the root, # including a correction: # - add one because the calculations include both the start and end # nodes depth = subsumer.max_depth() + 1 # Note: No need for an additional add-one correction for non-nouns # to account for an imaginary root node because that is now automatically # handled by simulate_root # if subsumer._pos != NOUN: # depth += 1 # Get the shortest path from the LCS to each of the synsets it is # subsuming. Add this to the LCS path length to get the path # length from each synset to the root. len1 = self.shortest_path_distance(subsumer, simulate_root=simulate_root and need_root) len2 = other.shortest_path_distance(subsumer, simulate_root=simulate_root and need_root) if len1 is None or len2 is None: return None len1 += depth len2 += depth return (2.0 * depth) / (len1 + len2) def res_similarity(self, other, ic, verbose=False): """ Resnik Similarity: Return a score denoting how similar two word senses are, based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node). :type other: Synset :param other: The ``Synset`` that this ``Synset`` is being compared to. :type ic: dict :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``). :return: A float score denoting the similarity of the two ``Synset`` objects. Synsets whose LCS is the root node of the taxonomy will have a score of 0 (e.g. N['dog'][0] and N['table'][0]). """ ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) return lcs_ic def jcn_similarity(self, other, ic, verbose=False): """ Jiang-Conrath Similarity: Return a score denoting how similar two word senses are, based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node) and that of the two input Synsets. The relationship is given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)). :type other: Synset :param other: The ``Synset`` that this ``Synset`` is being compared to. :type ic: dict :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``). :return: A float score denoting the similarity of the two ``Synset`` objects. """ if self == other: return _INF ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) # If either of the input synsets are the root synset, or have a # frequency of 0 (sparse data problem), return 0. if ic1 == 0 or ic2 == 0: return 0 ic_difference = ic1 + ic2 - 2 * lcs_ic if ic_difference == 0: return _INF return 1 / ic_difference def lin_similarity(self, other, ic, verbose=False): """ Lin Similarity: Return a score denoting how similar two word senses are, based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node) and that of the two input Synsets. The relationship is given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)). :type other: Synset :param other: The ``Synset`` that this ``Synset`` is being compared to. :type ic: dict :param ic: an information content object (as returned by ``nltk.corpus.wordnet_ic.ic()``). :return: A float score denoting the similarity of the two ``Synset`` objects, in the range 0 to 1. 
""" ic1, ic2, lcs_ic = _lcs_ic(self, other, ic) return (2.0 * lcs_ic) / (ic1 + ic2) def _iter_hypernym_lists(self): """ :return: An iterator over ``Synset`` objects that are either proper hypernyms or instance of hypernyms of the synset. """ todo = [self] seen = set() while todo: for synset in todo: seen.add(synset) yield todo todo = [hypernym for synset in todo for hypernym in (synset.hypernyms() + synset.instance_hypernyms()) if hypernym not in seen] def __repr__(self): return "%s('%s')" % (type(self).__name__, self._name) def _related(self, relation_symbol, sort=True): get_synset = self._wordnet_corpus_reader._synset_from_pos_and_offset pointer_tuples = self._pointers[relation_symbol] r = [get_synset(pos, offset) for pos, offset in pointer_tuples] if sort: r.sort() return r ###################################################################### ## WordNet Corpus Reader ###################################################################### class WordNetCorpusReader(CorpusReader): """ A corpus reader used to access wordnet or its variants. """ _ENCODING = 'utf8' #{ Part-of-speech constants ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v' #} #{ Filename constants _FILEMAP = {ADJ: 'adj', ADV: 'adv', NOUN: 'noun', VERB: 'verb'} #} #{ Part of speech constants _pos_numbers = {NOUN: 1, VERB: 2, ADJ: 3, ADV: 4, ADJ_SAT: 5} _pos_names = dict(tup[::-1] for tup in _pos_numbers.items()) #} #: A list of file identifiers for all the fileids used by this #: corpus reader. _FILES = ('cntlist.rev', 'lexnames', 'index.sense', 'index.adj', 'index.adv', 'index.noun', 'index.verb', 'data.adj', 'data.adv', 'data.noun', 'data.verb', 'adj.exc', 'adv.exc', 'noun.exc', 'verb.exc', ) def __init__(self, root, omw_reader): """ Construct a new wordnet corpus reader, with the given root directory. """ super(WordNetCorpusReader, self).__init__(root, self._FILES, encoding=self._ENCODING) # A index that provides the file offset # Map from lemma -> pos -> synset_index -> offset self._lemma_pos_offset_map = defaultdict(dict) # A cache so we don't have to reconstuct synsets # Map from pos -> offset -> synset self._synset_offset_cache = defaultdict(dict) # A lookup for the maximum depth of each part of speech. Useful for # the lch similarity metric. self._max_depth = defaultdict(dict) # Corpus reader containing omw data. 
self._omw_reader = omw_reader # A cache to store the wordnet data of multiple languages self._lang_data = defaultdict(list) self._data_file_map = {} self._exception_map = {} self._lexnames = [] self._key_count_file = None self._key_synset_file = None # Load the lexnames for i, line in enumerate(self.open('lexnames')): index, lexname, _ = line.split() assert int(index) == i self._lexnames.append(lexname) # Load the indices for lemmas and synset offsets self._load_lemma_pos_offset_map() # load the exception file data into memory self._load_exception_map() # Open Multilingual WordNet functions, contributed by # Nasruddin A’aidil Shari, Sim Wei Ying Geraldine, and Soe Lynn def of2ss(self, of): ''' take an id and return the synsets ''' return self._synset_from_pos_and_offset(of[-1], int(of[:8])) def ss2of(self, ss): ''' return the ID of the synset ''' return ("{:08d}-{}".format(ss.offset(), ss.pos())) def _load_lang_data(self, lang): ''' load the wordnet data of the requested language from the file to the cache, _lang_data ''' if lang not in self.langs(): raise WordNetError("Language is not supported.") if lang in self._lang_data.keys(): return f = self._omw_reader.open('{0:}/wn-data-{0:}.tab'.format(lang)) self._lang_data[lang].append(defaultdict(list)) self._lang_data[lang].append(defaultdict(list)) for l in f.readlines(): l = l.replace('\n', '') l = l.replace(' ', '_') if l[0] != '#': word = l.split('\t') self._lang_data[lang][0][word[0]].append(word[2]) self._lang_data[lang][1][word[2]].append(word[0]) f.close() def langs(self): ''' return a list of languages supported by Multilingual Wordnet ''' import os langs = [ 'eng' ] fileids = self._omw_reader.fileids() for fileid in fileids: file_name, file_extension = os.path.splitext(fileid) if file_extension == '.tab': langs.append(file_name.split('-')[-1]) return langs def _load_lemma_pos_offset_map(self): for suffix in self._FILEMAP.values(): # parse each line of the file (ignoring comment lines) for i, line in enumerate(self.open('index.%s' % suffix)): if line.startswith(' '): continue _iter = iter(line.split()) _next_token = lambda: next(_iter) try: # get the lemma and part-of-speech lemma = _next_token() pos = _next_token() # get the number of synsets for this lemma n_synsets = int(_next_token()) assert n_synsets > 0 # get the pointer symbols for all synsets of this lemma n_pointers = int(_next_token()) _ = [_next_token() for _ in xrange(n_pointers)] # same as number of synsets n_senses = int(_next_token()) assert n_synsets == n_senses # get number of senses ranked according to frequency _ = int(_next_token()) # get synset offsets synset_offsets = [int(_next_token()) for _ in xrange(n_synsets)] # raise more informative error with file name and line number except (AssertionError, ValueError) as e: tup = ('index.%s' % suffix), (i + 1), e raise WordNetError('file %s, line %i: %s' % tup) # map lemmas and parts of speech to synsets self._lemma_pos_offset_map[lemma][pos] = synset_offsets if pos == ADJ: self._lemma_pos_offset_map[lemma][ADJ_SAT] = synset_offsets def _load_exception_map(self): # load the exception file data into memory for pos, suffix in self._FILEMAP.items(): self._exception_map[pos] = {} for line in self.open('%s.exc' % suffix): terms = line.split() self._exception_map[pos][terms[0]] = terms[1:] self._exception_map[ADJ_SAT] = self._exception_map[ADJ] def _compute_max_depth(self, pos, simulate_root): """ Compute the max depth for the given part of speech. This is used by the lch similarity metric. 
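# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the Open
# Multilingual Wordnet hooks defined above.  Assumes nltk.download('wordnet')
# and nltk.download('omw'); 'ita' only appears in langs() once the OMW tab
# files are installed, and the offset shown is from WordNet 3.0.
from nltk.corpus import wordnet as wn

print('ita' in wn.langs())              # True with the OMW data present
print(wn.synsets('cane', lang='ita'))   # Italian 'cane' maps to dog.n.01, among others
print(wn.ss2of(wn.synset('dog.n.01')))  # '02084071-n' -- the offset-pos identifier
# ----------------------------------------------------------------------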
""" depth = 0 for ii in self.all_synsets(pos): try: depth = max(depth, ii.max_depth()) except RuntimeError: print(ii) if simulate_root: depth += 1 self._max_depth[pos] = depth def get_version(self): fh = self._data_file(ADJ) for line in fh: match = re.search(r'WordNet (\d+\.\d+) Copyright', line) if match is not None: version = match.group(1) fh.seek(0) return version #//////////////////////////////////////////////////////////// # Loading Lemmas #//////////////////////////////////////////////////////////// def lemma(self, name, lang='eng'): '''Return lemma object that matches the name''' # cannot simply split on first '.', e.g.: '.45_caliber.a.01..45_caliber' separator = SENSENUM_RE.search(name).start() synset_name, lemma_name = name[:separator+3], name[separator+4:] synset = self.synset(synset_name) for lemma in synset.lemmas(lang): if lemma._name == lemma_name: return lemma raise WordNetError('no lemma %r in %r' % (lemma_name, synset_name)) def lemma_from_key(self, key): # Keys are case sensitive and always lower-case key = key.lower() lemma_name, lex_sense = key.split('%') pos_number, lexname_index, lex_id, _, _ = lex_sense.split(':') pos = self._pos_names[int(pos_number)] # open the key -> synset file if necessary if self._key_synset_file is None: self._key_synset_file = self.open('index.sense') # Find the synset for the lemma. synset_line = _binary_search_file(self._key_synset_file, key) if not synset_line: raise WordNetError("No synset found for key %r" % key) offset = int(synset_line.split()[1]) synset = self._synset_from_pos_and_offset(pos, offset) # return the corresponding lemma for lemma in synset._lemmas: if lemma._key == key: return lemma raise WordNetError("No lemma found for for key %r" % key) #//////////////////////////////////////////////////////////// # Loading Synsets #//////////////////////////////////////////////////////////// def synset(self, name): # split name into lemma, part of speech and synset number lemma, pos, synset_index_str = name.lower().rsplit('.', 2) synset_index = int(synset_index_str) - 1 # get the offset for this synset try: offset = self._lemma_pos_offset_map[lemma][pos][synset_index] except KeyError: message = 'no lemma %r with part of speech %r' raise WordNetError(message % (lemma, pos)) except IndexError: n_senses = len(self._lemma_pos_offset_map[lemma][pos]) message = "lemma %r with part of speech %r has only %i %s" if n_senses == 1: tup = lemma, pos, n_senses, "sense" else: tup = lemma, pos, n_senses, "senses" raise WordNetError(message % tup) # load synset information from the appropriate file synset = self._synset_from_pos_and_offset(pos, offset) # some basic sanity checks on loaded attributes if pos == 's' and synset._pos == 'a': message = ('adjective satellite requested but only plain ' 'adjective found for lemma %r') raise WordNetError(message % lemma) assert synset._pos == pos or (pos == 'a' and synset._pos == 's') # Return the synset object. return synset def _data_file(self, pos): """ Return an open file pointer for the data file for the given part of speech. 
""" if pos == ADJ_SAT: pos = ADJ if self._data_file_map.get(pos) is None: fileid = 'data.%s' % self._FILEMAP[pos] self._data_file_map[pos] = self.open(fileid) return self._data_file_map[pos] def _synset_from_pos_and_offset(self, pos, offset): # Check to see if the synset is in the cache if offset in self._synset_offset_cache[pos]: return self._synset_offset_cache[pos][offset] data_file = self._data_file(pos) data_file.seek(offset) data_file_line = data_file.readline() synset = self._synset_from_pos_and_line(pos, data_file_line) assert synset._offset == offset self._synset_offset_cache[pos][offset] = synset return synset def _synset_from_pos_and_line(self, pos, data_file_line): # Construct a new (empty) synset. synset = Synset(self) # parse the entry for this synset try: # parse out the definitions and examples from the gloss columns_str, gloss = data_file_line.split('|') gloss = gloss.strip() definitions = [] for gloss_part in gloss.split(';'): gloss_part = gloss_part.strip() if gloss_part.startswith('"'): synset._examples.append(gloss_part.strip('"')) else: definitions.append(gloss_part) synset._definition = '; '.join(definitions) # split the other info into fields _iter = iter(columns_str.split()) _next_token = lambda: next(_iter) # get the offset synset._offset = int(_next_token()) # determine the lexicographer file name lexname_index = int(_next_token()) synset._lexname = self._lexnames[lexname_index] # get the part of speech synset._pos = _next_token() # create Lemma objects for each lemma n_lemmas = int(_next_token(), 16) for _ in xrange(n_lemmas): # get the lemma name lemma_name = _next_token() # get the lex_id (used for sense_keys) lex_id = int(_next_token(), 16) # If the lemma has a syntactic marker, extract it. m = re.match(r'(.*?)(\(.*\))?$', lemma_name) lemma_name, syn_mark = m.groups() # create the lemma object lemma = Lemma(self, synset, lemma_name, lexname_index, lex_id, syn_mark) synset._lemmas.append(lemma) synset._lemma_names.append(lemma._name) # collect the pointer tuples n_pointers = int(_next_token()) for _ in xrange(n_pointers): symbol = _next_token() offset = int(_next_token()) pos = _next_token() lemma_ids_str = _next_token() if lemma_ids_str == '0000': synset._pointers[symbol].add((pos, offset)) else: source_index = int(lemma_ids_str[:2], 16) - 1 target_index = int(lemma_ids_str[2:], 16) - 1 source_lemma_name = synset._lemmas[source_index]._name lemma_pointers = synset._lemma_pointers tups = lemma_pointers[source_lemma_name, symbol] tups.add((pos, offset, target_index)) # read the verb frames try: frame_count = int(_next_token()) except StopIteration: pass else: for _ in xrange(frame_count): # read the plus sign plus = _next_token() assert plus == '+' # read the frame and lemma number frame_number = int(_next_token()) frame_string_fmt = VERB_FRAME_STRINGS[frame_number] lemma_number = int(_next_token(), 16) # lemma number of 00 means all words in the synset if lemma_number == 0: synset._frame_ids.append(frame_number) for lemma in synset._lemmas: lemma._frame_ids.append(frame_number) lemma._frame_strings.append(frame_string_fmt % lemma._name) # only a specific word in the synset else: lemma = synset._lemmas[lemma_number - 1] lemma._frame_ids.append(frame_number) lemma._frame_strings.append(frame_string_fmt % lemma._name) # raise a more informative error with line text except ValueError as e: raise WordNetError('line %r: %s' % (data_file_line, e)) # set sense keys for Lemma objects - note that this has to be # done afterwards so that the relations are available for 
lemma in synset._lemmas: if synset._pos == ADJ_SAT: head_lemma = synset.similar_tos()[0]._lemmas[0] head_name = head_lemma._name head_id = '%02d' % head_lemma._lex_id else: head_name = head_id = '' tup = (lemma._name, WordNetCorpusReader._pos_numbers[synset._pos], lemma._lexname_index, lemma._lex_id, head_name, head_id) lemma._key = ('%s%%%d:%02d:%02d:%s:%s' % tup).lower() # the canonical name is based on the first lemma lemma_name = synset._lemmas[0]._name.lower() offsets = self._lemma_pos_offset_map[lemma_name][synset._pos] sense_index = offsets.index(synset._offset) tup = lemma_name, synset._pos, sense_index + 1 synset._name = '%s.%s.%02i' % tup return synset #//////////////////////////////////////////////////////////// # Retrieve synsets and lemmas. #//////////////////////////////////////////////////////////// def synsets(self, lemma, pos=None, lang='eng'): """Load all synsets with a given lemma and part of speech tag. If no pos is specified, all synsets for all parts of speech will be loaded. If lang is specified, all the synsets associated with the lemma name of that language will be returned. """ lemma = lemma.lower() if lang == 'eng': get_synset = self._synset_from_pos_and_offset index = self._lemma_pos_offset_map if pos is None: pos = POS_LIST return [get_synset(p, offset) for p in pos for form in self._morphy(lemma, p) for offset in index[form].get(p, [])] else: self._load_lang_data(lang) synset_list = [] for l in self._lang_data[lang][1][lemma]: if pos is not None and l[-1] != pos: continue synset_list.append(self.of2ss(l)) return synset_list def lemmas(self, lemma, pos=None, lang='eng'): """Return all Lemma objects with a name matching the specified lemma name and part of speech tag. Matches any part of speech tag if none is specified.""" if lang == 'eng': lemma = lemma.lower() return [lemma_obj for synset in self.synsets(lemma, pos) for lemma_obj in synset.lemmas() if lemma_obj.name().lower() == lemma] else: self._load_lang_data(lang) lemmas = [] syn = self.synsets(lemma, lang=lang) for s in syn: if pos is not None and s.pos() != pos: continue a = Lemma(self, s, lemma, self._lexnames.index(s.lexname()), 0, None) a._lang = lang lemmas.append(a) return lemmas def all_lemma_names(self, pos=None, lang='eng'): """Return all lemma names for all synsets for the given part of speech tag and langauge or languages. If pos is not specified, all synsets for all parts of speech will be used.""" if lang == 'eng': if pos is None: return iter(self._lemma_pos_offset_map) else: return (lemma for lemma in self._lemma_pos_offset_map if pos in self._lemma_pos_offset_map[lemma]) else: self._load_lang_data(lang) lemma = [] for i in self._lang_data[lang][0]: if pos is not None and i[-1] != pos: continue lemma.extend(self._lang_data[lang][0][i]) lemma = list(set(lemma)) return lemma def all_synsets(self, pos=None): """Iterate over all synsets with a given part of speech tag. If no pos is specified, all synsets for all parts of speech will be loaded. """ if pos is None: pos_tags = self._FILEMAP.keys() else: pos_tags = [pos] cache = self._synset_offset_cache from_pos_and_line = self._synset_from_pos_and_line # generate all synsets for each part of speech for pos_tag in pos_tags: # Open the file for reading. Note that we can not re-use # the file poitners from self._data_file_map here, because # we're defining an iterator, and those file pointers might # be moved while we're not looking. 
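# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original module): enumerating synsets
# and lemmas with the methods defined above.  Assumes nltk.download('wordnet');
# the results described are for WordNet 3.0.
from nltk.corpus import wordnet as wn

print(wn.synsets('dog'))                         # seven noun senses plus chase.v.01
print(wn.synsets('dog', pos=wn.VERB))            # [Synset('chase.v.01')]
print(len(wn.lemmas('dog', pos=wn.NOUN)))        # one Lemma per noun sense of 'dog'
print('dog' in wn.all_lemma_names(pos=wn.NOUN))  # True
# ----------------------------------------------------------------------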
if pos_tag == ADJ_SAT: pos_tag = ADJ fileid = 'data.%s' % self._FILEMAP[pos_tag] data_file = self.open(fileid) try: # generate synsets for each line in the POS file offset = data_file.tell() line = data_file.readline() while line: if not line[0].isspace(): if offset in cache[pos_tag]: # See if the synset is cached synset = cache[pos_tag][offset] else: # Otherwise, parse the line synset = from_pos_and_line(pos_tag, line) cache[pos_tag][offset] = synset # adjective satellites are in the same file as # adjectives so only yield the synset if it's actually # a satellite if synset._pos == ADJ_SAT: yield synset # for all other POS tags, yield all synsets (this means # that adjectives also include adjective satellites) else: yield synset offset = data_file.tell() line = data_file.readline() # close the extra file handle we opened except: data_file.close() raise else: data_file.close() def words(self, lang='eng'): """return lemmas of the given language as list of words""" return self.all_lemma_names(lang=lang) def license(self, lang='eng'): """Return the contents of LICENSE (for omw) use lang=lang to get the license for an individual language""" if lang == 'eng': return self.open("LICENSE").read() elif lang in self.langs(): return self._omw_reader.open("{}/LICENSE".format(lang)).read() elif lang == 'omw': ### under the not unreasonable assumption you don't mean Omwunra-Toqura return self._omw_reader.open("LICENSE").read() else: raise WordNetError("Language is not supported.") def readme(self, lang='omw'): """Return the contents of README (for omw) use lang=lang to get the readme for an individual language""" if lang == 'eng': return self.open("README").read() elif lang in self.langs(): return self._omw_reader.open("{}/README".format(lang)).read() elif lang == 'omw': ### under the not unreasonable assumption you don't mean Omwunra-Toqura return self._omw_reader.open("README").read() else: raise WordNetError("Language is not supported.") def citation(self, lang='omw'): """Return the contents of citation.bib file (for omw) use lang=lang to get the citation for an individual language""" if lang == 'eng': return self.open("citation.bib").read() elif lang in self.langs(): return self._omw_reader.open("{}/citation.bib".format(lang)).read() elif lang == 'omw': ### under the not unreasonable assumption you don't mean Omwunra-Toqura return self._omw_reader.open("citation.bib").read() else: raise WordNetError("Language is not supported.") #//////////////////////////////////////////////////////////// # Misc #//////////////////////////////////////////////////////////// def lemma_count(self, lemma): """Return the frequency count for this Lemma""" # Currently, count is only work for English if lemma._lang != 'eng': return 0 # open the count file if we haven't already if self._key_count_file is None: self._key_count_file = self.open('cntlist.rev') # find the key in the counts file and return the count line = _binary_search_file(self._key_count_file, lemma._key) if line: return int(line.rsplit(' ', 1)[-1]) else: return 0 def path_similarity(self, synset1, synset2, verbose=False, simulate_root=True): return synset1.path_similarity(synset2, verbose, simulate_root) path_similarity.__doc__ = Synset.path_similarity.__doc__ def lch_similarity(self, synset1, synset2, verbose=False, simulate_root=True): return synset1.lch_similarity(synset2, verbose, simulate_root) lch_similarity.__doc__ = Synset.lch_similarity.__doc__ def wup_similarity(self, synset1, synset2, verbose=False, simulate_root=True): return 
synset1.wup_similarity(synset2, verbose, simulate_root) wup_similarity.__doc__ = Synset.wup_similarity.__doc__ def res_similarity(self, synset1, synset2, ic, verbose=False): return synset1.res_similarity(synset2, ic, verbose) res_similarity.__doc__ = Synset.res_similarity.__doc__ def jcn_similarity(self, synset1, synset2, ic, verbose=False): return synset1.jcn_similarity(synset2, ic, verbose) jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__ def lin_similarity(self, synset1, synset2, ic, verbose=False): return synset1.lin_similarity(synset2, ic, verbose) lin_similarity.__doc__ = Synset.lin_similarity.__doc__ #//////////////////////////////////////////////////////////// # Morphy #//////////////////////////////////////////////////////////// # Morphy, adapted from Oliver Steele's pywordnet def morphy(self, form, pos=None): """ Find a possible base form for the given form, with the given part of speech, by checking WordNet's list of exceptional forms, and by recursively stripping affixes for this part of speech until a form in WordNet is found. >>> from nltk.corpus import wordnet as wn >>> print(wn.morphy('dogs')) dog >>> print(wn.morphy('churches')) church >>> print(wn.morphy('aardwolves')) aardwolf >>> print(wn.morphy('abaci')) abacus >>> wn.morphy('hardrock', wn.ADV) >>> print(wn.morphy('book', wn.NOUN)) book >>> wn.morphy('book', wn.ADJ) """ if pos is None: morphy = self._morphy analyses = chain(a for p in POS_LIST for a in morphy(form, p)) else: analyses = self._morphy(form, pos) # get the first one we find first = list(islice(analyses, 1)) if len(first) == 1: return first[0] else: return None MORPHOLOGICAL_SUBSTITUTIONS = { NOUN: [('s', ''), ('ses', 's'), ('ves', 'f'), ('xes', 'x'), ('zes', 'z'), ('ches', 'ch'), ('shes', 'sh'), ('men', 'man'), ('ies', 'y')], VERB: [('s', ''), ('ies', 'y'), ('es', 'e'), ('es', ''), ('ed', 'e'), ('ed', ''), ('ing', 'e'), ('ing', '')], ADJ: [('er', ''), ('est', ''), ('er', 'e'), ('est', 'e')], ADV: []} MORPHOLOGICAL_SUBSTITUTIONS[ADJ_SAT] = MORPHOLOGICAL_SUBSTITUTIONS[ADJ] def _morphy(self, form, pos): # from jordanbg: # Given an original string x # 1. Apply rules once to the input to get y1, y2, y3, etc. # 2. Return all that are in the database # 3. If there are no matches, keep applying rules until you either # find a match or you can't go any further exceptions = self._exception_map[pos] substitutions = self.MORPHOLOGICAL_SUBSTITUTIONS[pos] def apply_rules(forms): return [form[:-len(old)] + new for form in forms for old, new in substitutions if form.endswith(old)] def filter_forms(forms): result = [] seen = set() for form in forms: if form in self._lemma_pos_offset_map: if pos in self._lemma_pos_offset_map[form]: if form not in seen: result.append(form) seen.add(form) return result # 0. Check the exception lists if form in exceptions: return filter_forms([form] + exceptions[form]) # 1. Apply rules once to the input to get y1, y2, y3, etc. forms = apply_rules([form]) # 2. Return all that are in the database (and check the original too) results = filter_forms([form] + forms) if results: return results # 3. 
If there are no matches, keep applying rules until we find a match while forms: forms = apply_rules(forms) results = filter_forms(forms) if results: return results # Return an empty list if we can't find anything return [] #//////////////////////////////////////////////////////////// # Create information content from corpus #//////////////////////////////////////////////////////////// def ic(self, corpus, weight_senses_equally = False, smoothing = 1.0): """ Creates an information content lookup dictionary from a corpus. :type corpus: CorpusReader :param corpus: The corpus from which we create an information content dictionary. :type weight_senses_equally: bool :param weight_senses_equally: If this is True, gives all possible senses equal weight rather than dividing by the number of possible senses. (If a word has 3 synses, each sense gets 0.3333 per appearance when this is False, 1.0 when it is true.) :param smoothing: How much do we smooth synset counts (default is 1.0) :type smoothing: float :return: An information content dictionary """ counts = FreqDist() for ww in corpus.words(): counts[ww] += 1 ic = {} for pp in POS_LIST: ic[pp] = defaultdict(float) # Initialize the counts with the smoothing value if smoothing > 0.0: for ss in self.all_synsets(): pos = ss._pos if pos == ADJ_SAT: pos = ADJ ic[pos][ss._offset] = smoothing for ww in counts: possible_synsets = self.synsets(ww) if len(possible_synsets) == 0: continue # Distribute weight among possible synsets weight = float(counts[ww]) if not weight_senses_equally: weight /= float(len(possible_synsets)) for ss in possible_synsets: pos = ss._pos if pos == ADJ_SAT: pos = ADJ for level in ss._iter_hypernym_lists(): for hh in level: ic[pos][hh._offset] += weight # Add the weight to the root ic[pos][0] += weight return ic ###################################################################### ## WordNet Information Content Corpus Reader ###################################################################### class WordNetICCorpusReader(CorpusReader): """ A corpus reader for the WordNet information content corpus. """ def __init__(self, root, fileids): CorpusReader.__init__(self, root, fileids, encoding='utf8') # this load function would be more efficient if the data was pickled # Note that we can't use NLTK's frequency distributions because # synsets are overlapping (each instance of a synset also counts # as an instance of its hypernyms) def ic(self, icfile): """ Load an information content file from the wordnet_ic corpus and return a dictionary. This dictionary has just two keys, NOUN and VERB, whose values are dictionaries that map from synsets to information content values. :type icfile: str :param icfile: The name of the wordnet_ic file (e.g. "ic-brown.dat") :return: An information content dictionary """ ic = {} ic[NOUN] = defaultdict(float) ic[VERB] = defaultdict(float) for num, line in enumerate(self.open(icfile)): if num == 0: # skip the header continue fields = line.split() offset = int(fields[0][:-1]) value = float(fields[1]) pos = _get_pos(fields[0]) if len(fields) == 3 and fields[2] == "ROOT": # Store root count. ic[pos][0] += value if value != 0: ic[pos][offset] = value return ic ###################################################################### # Similarity metrics ###################################################################### # TODO: Add in the option to manually add a new root node; this will be # useful for verb similarity as there exist multiple verb taxonomies. 
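# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original module): building an
# information-content dictionary either from a running corpus with ic() or
# from a precomputed file via the wordnet_ic reader.  Assumes
# nltk.download('wordnet'), nltk.download('wordnet_ic') and
# nltk.download('genesis').
from nltk.corpus import wordnet as wn, wordnet_ic, genesis

brown_ic = wordnet_ic.ic('ic-brown.dat')    # WordNetICCorpusReader.ic()
genesis_ic = wn.ic(genesis, False, 0.0)     # WordNetCorpusReader.ic()
dog, cat = wn.synset('dog.n.01'), wn.synset('cat.n.01')
print(dog.res_similarity(cat, brown_ic))    # IC of the LCS under the Brown counts
print(dog.res_similarity(cat, genesis_ic))  # generally a different value
# ----------------------------------------------------------------------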
# More information about the metrics is available at
# http://marimba.d.umn.edu/similarity/measures.html


def path_similarity(synset1, synset2, verbose=False, simulate_root=True):
    return synset1.path_similarity(synset2, verbose, simulate_root)
path_similarity.__doc__ = Synset.path_similarity.__doc__


def lch_similarity(synset1, synset2, verbose=False, simulate_root=True):
    return synset1.lch_similarity(synset2, verbose, simulate_root)
lch_similarity.__doc__ = Synset.lch_similarity.__doc__


def wup_similarity(synset1, synset2, verbose=False, simulate_root=True):
    return synset1.wup_similarity(synset2, verbose, simulate_root)
wup_similarity.__doc__ = Synset.wup_similarity.__doc__


def res_similarity(synset1, synset2, ic, verbose=False):
    return synset1.res_similarity(synset2, ic, verbose)
res_similarity.__doc__ = Synset.res_similarity.__doc__


def jcn_similarity(synset1, synset2, ic, verbose=False):
    return synset1.jcn_similarity(synset2, ic, verbose)
jcn_similarity.__doc__ = Synset.jcn_similarity.__doc__


def lin_similarity(synset1, synset2, ic, verbose=False):
    return synset1.lin_similarity(synset2, ic, verbose)
lin_similarity.__doc__ = Synset.lin_similarity.__doc__


def _lcs_ic(synset1, synset2, ic, verbose=False):
    """
    Get the information content of the least common subsumer that has
    the highest information content value.  If two nodes have no
    explicit common subsumer, assume that they share an artificial
    root node that is the hypernym of all explicit roots.

    :type synset1: Synset
    :param synset1: First input synset.
    :type synset2: Synset
    :param synset2: Second input synset.  Must be the same part of
        speech as the first synset.
    :type ic: dict
    :param ic: an information content object (as returned by ``load_ic()``).
    :return: The information content of the two synsets and their most
        informative subsumer
    """
    if synset1._pos != synset2._pos:
        raise WordNetError('Computing the least common subsumer requires ' + \
                           '%s and %s to have the same part of speech.'
% \ (synset1, synset2)) ic1 = information_content(synset1, ic) ic2 = information_content(synset2, ic) subsumers = synset1.common_hypernyms(synset2) if len(subsumers) == 0: subsumer_ic = 0 else: subsumer_ic = max(information_content(s, ic) for s in subsumers) if verbose: print("> LCS Subsumer by content:", subsumer_ic) return ic1, ic2, subsumer_ic # Utility functions def information_content(synset, ic): try: icpos = ic[synset._pos] except KeyError: msg = 'Information content file has no entries for part-of-speech: %s' raise WordNetError(msg % synset._pos) counts = icpos[synset._offset] if counts == 0: return _INF else: return -math.log(counts / icpos[0]) # get the part of speech (NOUN or VERB) from the information content record # (each identifier has a 'n' or 'v' suffix) def _get_pos(field): if field[-1] == 'n': return NOUN elif field[-1] == 'v': return VERB else: msg = "Unidentified part of speech in WordNet Information Content file for field %s" % field raise ValueError(msg) # unload corpus after tests def teardown_module(module=None): from nltk.corpus import wordnet wordnet._unload() ###################################################################### # Demo ###################################################################### def demo(): import nltk print('loading wordnet') wn = WordNetCorpusReader(nltk.data.find('corpora/wordnet'), None) print('done loading') S = wn.synset L = wn.lemma print('getting a synset for go') move_synset = S('go.v.21') print(move_synset.name(), move_synset.pos(), move_synset.lexname()) print(move_synset.lemma_names()) print(move_synset.definition()) print(move_synset.examples()) zap_n = ['zap.n.01'] zap_v = ['zap.v.01', 'zap.v.02', 'nuke.v.01', 'microwave.v.01'] def _get_synsets(synset_strings): return [S(synset) for synset in synset_strings] zap_n_synsets = _get_synsets(zap_n) zap_v_synsets = _get_synsets(zap_v) print(zap_n_synsets) print(zap_v_synsets) print("Navigations:") print(S('travel.v.01').hypernyms()) print(S('travel.v.02').hypernyms()) print(S('travel.v.03').hypernyms()) print(L('zap.v.03.nuke').derivationally_related_forms()) print(L('zap.v.03.atomize').derivationally_related_forms()) print(L('zap.v.03.atomise').derivationally_related_forms()) print(L('zap.v.03.zap').derivationally_related_forms()) print(S('dog.n.01').member_holonyms()) print(S('dog.n.01').part_meronyms()) print(S('breakfast.n.1').hypernyms()) print(S('meal.n.1').hyponyms()) print(S('Austen.n.1').instance_hypernyms()) print(S('composer.n.1').instance_hyponyms()) print(S('faculty.n.2').member_meronyms()) print(S('copilot.n.1').member_holonyms()) print(S('table.n.2').part_meronyms()) print(S('course.n.7').part_holonyms()) print(S('water.n.1').substance_meronyms()) print(S('gin.n.1').substance_holonyms()) print(L('leader.n.1.leader').antonyms()) print(L('increase.v.1.increase').antonyms()) print(S('snore.v.1').entailments()) print(S('heavy.a.1').similar_tos()) print(S('light.a.1').attributes()) print(S('heavy.a.1').attributes()) print(L('English.a.1.English').pertainyms()) print(S('person.n.01').root_hypernyms()) print(S('sail.v.01').root_hypernyms()) print(S('fall.v.12').root_hypernyms()) print(S('person.n.01').lowest_common_hypernyms(S('dog.n.01'))) print(S('woman.n.01').lowest_common_hypernyms(S('girlfriend.n.02'))) print(S('dog.n.01').path_similarity(S('cat.n.01'))) print(S('dog.n.01').lch_similarity(S('cat.n.01'))) print(S('dog.n.01').wup_similarity(S('cat.n.01'))) wnic = WordNetICCorpusReader(nltk.data.find('corpora/wordnet_ic'), '.*\.dat') ic = wnic.ic('ic-brown.dat') 
print(S('dog.n.01').jcn_similarity(S('cat.n.01'), ic)) ic = wnic.ic('ic-semcor.dat') print(S('dog.n.01').lin_similarity(S('cat.n.01'), ic)) print(S('code.n.03').topic_domains()) print(S('pukka.a.01').region_domains()) print(S('freaky.a.01').usage_domains()) if __name__ == '__main__': demo() nltk-3.1/nltk/corpus/reader/xmldocs.py0000644000076500000240000003727612607224144017645 0ustar sbstaff00000000000000# Natural Language Toolkit: XML Corpus Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT """ Corpus reader for corpora whose documents are xml files. (note -- not named 'xml' to avoid conflicting w/ standard xml package) """ from __future__ import print_function, unicode_literals import codecs # Use the c version of ElementTree, which is faster, if possible: try: from xml.etree import cElementTree as ElementTree except ImportError: from xml.etree import ElementTree from nltk import compat from nltk.data import SeekableUnicodeStreamReader from nltk.tokenize import WordPunctTokenizer from nltk.internals import ElementWrapper from nltk.corpus.reader.api import CorpusReader from nltk.corpus.reader.util import * class XMLCorpusReader(CorpusReader): """ Corpus reader for corpora whose documents are xml files. Note that the ``XMLCorpusReader`` constructor does not take an ``encoding`` argument, because the unicode encoding is specified by the XML files themselves. See the XML specs for more info. """ def __init__(self, root, fileids, wrap_etree=False): self._wrap_etree = wrap_etree CorpusReader.__init__(self, root, fileids) def xml(self, fileid=None): # Make sure we have exactly one file -- no concatenating XML. if fileid is None and len(self._fileids) == 1: fileid = self._fileids[0] if not isinstance(fileid, compat.string_types): raise TypeError('Expected a single file identifier string') # Read the XML in using ElementTree. elt = ElementTree.parse(self.abspath(fileid).open()).getroot() # If requested, wrap it. if self._wrap_etree: elt = ElementWrapper(elt) # Return the ElementTree element. return elt def words(self, fileid=None): """ Returns all of the words and punctuation symbols in the specified file that were in text nodes -- ie, tags are ignored. Like the xml() method, fileid can only specify one file. :return: the given file's text nodes as a list of words and punctuation symbols :rtype: list(str) """ elt = self.xml(fileid) encoding = self.encoding(fileid) word_tokenizer=WordPunctTokenizer() iterator = elt.getiterator() out = [] for node in iterator: text = node.text if text is not None: if isinstance(text, bytes): text = text.decode(encoding) toks = word_tokenizer.tokenize(text) out.extend(toks) return out def raw(self, fileids=None): if fileids is None: fileids = self._fileids elif isinstance(fileids, compat.string_types): fileids = [fileids] return concat([self.open(f).read() for f in fileids]) class XMLCorpusView(StreamBackedCorpusView): """ A corpus view that selects out specified elements from an XML file, and provides a flat list-like interface for accessing them. (Note: ``XMLCorpusView`` is not used by ``XMLCorpusReader`` itself, but may be used by subclasses of ``XMLCorpusReader``.) Every XML corpus view has a "tag specification", indicating what XML elements should be included in the view; and each (non-nested) element that matches this specification corresponds to one item in the view. 
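# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original module): reading an XML
# corpus with XMLCorpusReader.  Assumes nltk.download('shakespeare');
# nltk.corpus.shakespeare is an XMLCorpusReader over XML editions of the
# plays, and 'dream.xml' is one of its fileids.
from nltk.corpus import shakespeare

print(shakespeare.fileids())               # ['a_and_c.xml', 'dream.xml', ...]
play = shakespeare.xml('dream.xml')        # a single ElementTree element
print(play.tag)                            # 'PLAY'
print(shakespeare.words('dream.xml')[:8])  # tokens drawn from the text nodes
# ----------------------------------------------------------------------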
Tag specifications are regular expressions over tag paths, where a tag path is a list of element tag names, separated by '/', indicating the ancestry of the element. Some examples: - ``'foo'``: A top-level element whose tag is ``foo``. - ``'foo/bar'``: An element whose tag is ``bar`` and whose parent is a top-level element whose tag is ``foo``. - ``'.*/foo'``: An element whose tag is ``foo``, appearing anywhere in the xml tree. - ``'.*/(foo|bar)'``: An wlement whose tag is ``foo`` or ``bar``, appearing anywhere in the xml tree. The view items are generated from the selected XML elements via the method ``handle_elt()``. By default, this method returns the element as-is (i.e., as an ElementTree object); but it can be overridden, either via subclassing or via the ``elt_handler`` constructor parameter. """ #: If true, then display debugging output to stdout when reading #: blocks. _DEBUG = False #: The number of characters read at a time by this corpus reader. _BLOCK_SIZE = 1024 def __init__(self, fileid, tagspec, elt_handler=None): """ Create a new corpus view based on a specified XML file. Note that the ``XMLCorpusView`` constructor does not take an ``encoding`` argument, because the unicode encoding is specified by the XML files themselves. :type tagspec: str :param tagspec: A tag specification, indicating what XML elements should be included in the view. Each non-nested element that matches this specification corresponds to one item in the view. :param elt_handler: A function used to transform each element to a value for the view. If no handler is specified, then ``self.handle_elt()`` is called, which returns the element as an ElementTree object. The signature of elt_handler is:: elt_handler(elt, tagspec) -> value """ if elt_handler: self.handle_elt = elt_handler self._tagspec = re.compile(tagspec+r'\Z') """The tag specification for this corpus view.""" self._tag_context = {0: ()} """A dictionary mapping from file positions (as returned by ``stream.seek()`` to XML contexts. An XML context is a tuple of XML tag names, indicating which tags have not yet been closed.""" encoding = self._detect_encoding(fileid) StreamBackedCorpusView.__init__(self, fileid, encoding=encoding) def _detect_encoding(self, fileid): if isinstance(fileid, PathPointer): s = fileid.open().readline() else: with open(fileid, 'rb') as infile: s = infile.readline() if s.startswith(codecs.BOM_UTF16_BE): return 'utf-16-be' if s.startswith(codecs.BOM_UTF16_LE): return 'utf-16-le' if s.startswith(codecs.BOM_UTF32_BE): return 'utf-32-be' if s.startswith(codecs.BOM_UTF32_LE): return 'utf-32-le' if s.startswith(codecs.BOM_UTF8): return 'utf-8' m = re.match(br'\s*<\?xml\b.*\bencoding="([^"]+)"', s) if m: return m.group(1).decode() m = re.match(br"\s*<\?xml\b.*\bencoding='([^']+)'", s) if m: return m.group(1).decode() # No encoding found -- what should the default be? return 'utf-8' def handle_elt(self, elt, context): """ Convert an element into an appropriate value for inclusion in the view. Unless overridden by a subclass or by the ``elt_handler`` constructor argument, this method simply returns ``elt``. :return: The view value corresponding to ``elt``. :type elt: ElementTree :param elt: The element that should be converted. :type context: str :param context: A string composed of element tags separated by forward slashes, indicating the XML context of the given element. 
For example, the string ``'foo/bar/baz'`` indicates that the element is a ``baz`` element whose parent is a ``bar`` element and whose grandparent is a top-level ``foo`` element. """ return elt #: A regular expression that matches XML fragments that do not #: contain any un-closed tags. _VALID_XML_RE = re.compile(r""" [^<]* ( (() | # comment () | # doctype decl (<[^!>][^>]*>)) # tag or PI [^<]*)* \Z""", re.DOTALL|re.VERBOSE) #: A regular expression used to extract the tag name from a start tag, #: end tag, or empty-elt tag string. _XML_TAG_NAME = re.compile('<\s*/?\s*([^\s>]+)') #: A regular expression used to find all start-tags, end-tags, and #: emtpy-elt tags in an XML file. This regexp is more lenient than #: the XML spec -- e.g., it allows spaces in some places where the #: spec does not. _XML_PIECE = re.compile(r""" # Include these so we can skip them: (?P )| (?P )| (?P <\?.*?\?> )| (?P ]*(\[[^\]]*])?\s*>)| # These are the ones we actually care about: (?P <\s*[^>/\?!\s][^>]*/\s*> )| (?P <\s*[^>/\?!\s][^>]*> )| (?P <\s*/[^>/\?!\s][^>]*> )""", re.DOTALL|re.VERBOSE) def _read_xml_fragment(self, stream): """ Read a string from the given stream that does not contain any un-closed tags. In particular, this function first reads a block from the stream of size ``self._BLOCK_SIZE``. It then checks if that block contains an un-closed tag. If it does, then this function either backtracks to the last '<', or reads another block. """ fragment = '' if isinstance(stream, SeekableUnicodeStreamReader): startpos = stream.tell() while True: # Read a block and add it to the fragment. xml_block = stream.read(self._BLOCK_SIZE) fragment += xml_block # Do we have a well-formed xml fragment? if self._VALID_XML_RE.match(fragment): return fragment # Do we have a fragment that will never be well-formed? if re.search('[<>]', fragment).group(0) == '>': pos = stream.tell() - ( len(fragment)-re.search('[<>]', fragment).end()) raise ValueError('Unexpected ">" near char %s' % pos) # End of file? if not xml_block: raise ValueError('Unexpected end of file: tag not closed') # If not, then we must be in the middle of a <..tag..>. # If appropriate, backtrack to the most recent '<' # character. last_open_bracket = fragment.rfind('<') if last_open_bracket > 0: if self._VALID_XML_RE.match(fragment[:last_open_bracket]): if isinstance(stream, SeekableUnicodeStreamReader): stream.seek(startpos) stream.char_seek_forward(last_open_bracket) else: stream.seek(-(len(fragment)-last_open_bracket), 1) return fragment[:last_open_bracket] # Otherwise, read another block. (i.e., return to the # top of the loop.) def read_block(self, stream, tagspec=None, elt_handler=None): """ Read from ``stream`` until we find at least one element that matches ``tagspec``, and return the result of applying ``elt_handler`` to each element found. """ if tagspec is None: tagspec = self._tagspec if elt_handler is None: elt_handler = self.handle_elt # Use a stack of strings to keep track of our context: context = list(self._tag_context.get(stream.tell())) assert context is not None # check this -- could it ever happen? elts = [] elt_start = None # where does the elt start elt_depth = None # what context depth elt_text = '' while elts==[] or elt_start is not None: if isinstance(stream, SeekableUnicodeStreamReader): startpos = stream.tell() xml_fragment = self._read_xml_fragment(stream) # End of file. if not xml_fragment: if elt_start is None: break else: raise ValueError('Unexpected end of file') # Process each in the xml fragment. 
for piece in self._XML_PIECE.finditer(xml_fragment): if self._DEBUG: print('%25s %s' % ('/'.join(context)[-20:], piece.group())) if piece.group('START_TAG'): name = self._XML_TAG_NAME.match(piece.group()).group(1) # Keep context up-to-date. context.append(name) # Is this one of the elts we're looking for? if elt_start is None: if re.match(tagspec, '/'.join(context)): elt_start = piece.start() elt_depth = len(context) elif piece.group('END_TAG'): name = self._XML_TAG_NAME.match(piece.group()).group(1) # sanity checks: if not context: raise ValueError('Unmatched tag ' % name) if name != context[-1]: raise ValueError('Unmatched tag <%s>...' % (context[-1], name)) # Is this the end of an element? if elt_start is not None and elt_depth == len(context): elt_text += xml_fragment[elt_start:piece.end()] elts.append( (elt_text, '/'.join(context)) ) elt_start = elt_depth = None elt_text = '' # Keep context up-to-date context.pop() elif piece.group('EMPTY_ELT_TAG'): name = self._XML_TAG_NAME.match(piece.group()).group(1) if elt_start is None: if re.match(tagspec, '/'.join(context)+'/'+name): elts.append((piece.group(), '/'.join(context)+'/'+name)) if elt_start is not None: # If we haven't found any elements yet, then keep # looping until we do. if elts == []: elt_text += xml_fragment[elt_start:] elt_start = 0 # If we've found at least one element, then try # backtracking to the start of the element that we're # inside of. else: # take back the last start-tag, and return what # we've gotten so far (elts is non-empty). if self._DEBUG: print(' '*36+'(backtrack)') if isinstance(stream, SeekableUnicodeStreamReader): stream.seek(startpos) stream.char_seek_forward(elt_start) else: stream.seek(-(len(xml_fragment)-elt_start), 1) context = context[:elt_depth-1] elt_start = elt_depth = None elt_text = '' # Update the _tag_context dict. pos = stream.tell() if pos in self._tag_context: assert tuple(context) == self._tag_context[pos] else: self._tag_context[pos] = tuple(context) return [elt_handler(ElementTree.fromstring( elt.encode('ascii', 'xmlcharrefreplace')), context) for (elt, context) in elts] nltk-3.1/nltk/corpus/reader/ycoe.py0000644000076500000240000002416712607224144017126 0ustar sbstaff00000000000000# -*- coding: iso-8859-1 -*- # Natural Language Toolkit: York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE) # # Copyright (C) 2001-2015 NLTK Project # Author: Selina Dennis # URL: # For license information, see LICENSE.TXT """ Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE), a 1.5 million word syntactically-annotated corpus of Old English prose texts. The corpus is distributed by the Oxford Text Archive: http://www.ota.ahds.ac.uk/ It is not included with NLTK. The YCOE corpus is divided into 100 files, each representing an Old English prose text. Tags used within each text complies to the YCOE standard: http://www-users.york.ac.uk/~lang22/YCOE/YcoeHome.htm """ import os import re from nltk import compat from nltk.tokenize import RegexpTokenizer from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader from nltk.corpus.reader.tagged import TaggedCorpusReader from nltk.corpus.reader.util import * from nltk.corpus.reader.api import * class YCOECorpusReader(CorpusReader): """ Corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE), a 1.5 million word syntactically-annotated corpus of Old English prose texts. 
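    Example usage, assuming the YCOE data has been obtained separately from
    the Oxford Text Archive and installed under ``corpora/ycoe`` in an NLTK
    data directory (the document identifiers shown are illustrative):

        >>> from nltk.corpus import ycoe              # doctest: +SKIP
        >>> ycoe.documents()[:2]                      # doctest: +SKIP
        ['coadrian.o34', 'coaelhom.o3']
        >>> ycoe.tagged_words('coadrian.o34')[:3]     # doctest: +SKIP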
""" def __init__(self, root, encoding='utf8'): CorpusReader.__init__(self, root, [], encoding) self._psd_reader = YCOEParseCorpusReader( self.root.join('psd'), '.*', '.psd', encoding=encoding) self._pos_reader = YCOETaggedCorpusReader( self.root.join('pos'), '.*', '.pos') # Make sure we have a consistent set of items: documents = set(f[:-4] for f in self._psd_reader.fileids()) if set(f[:-4] for f in self._pos_reader.fileids()) != documents: raise ValueError('Items in "psd" and "pos" ' 'subdirectories do not match.') fileids = sorted(['%s.psd' % doc for doc in documents] + ['%s.pos' % doc for doc in documents]) CorpusReader.__init__(self, root, fileids, encoding) self._documents = sorted(documents) def documents(self, fileids=None): """ Return a list of document identifiers for all documents in this corpus, or for the documents with the given file(s) if specified. """ if fileids is None: return self._documents if isinstance(fileids, compat.string_types): fileids = [fileids] for f in fileids: if f not in self._fileids: raise KeyError('File id %s not found' % fileids) # Strip off the '.pos' and '.psd' extensions. return sorted(set(f[:-4] for f in fileids)) def fileids(self, documents=None): """ Return a list of file identifiers for the files that make up this corpus, or that store the given document(s) if specified. """ if documents is None: return self._fileids elif isinstance(documents, compat.string_types): documents = [documents] return sorted(set(['%s.pos' % doc for doc in documents] + ['%s.psd' % doc for doc in documents])) def _getfileids(self, documents, subcorpus): """ Helper that selects the appropriate fileids for a given set of documents from a given subcorpus (pos or psd). """ if documents is None: documents = self._documents else: if isinstance(documents, compat.string_types): documents = [documents] for document in documents: if document not in self._documents: if document[-4:] in ('.pos', '.psd'): raise ValueError( 'Expected a document identifier, not a file ' 'identifier. (Use corpus.documents() to get ' 'a list of document identifiers.') else: raise ValueError('Document identifier %s not found' % document) return ['%s.%s' % (d, subcorpus) for d in documents] # Delegate to one of our two sub-readers: def words(self, documents=None): return self._pos_reader.words(self._getfileids(documents, 'pos')) def sents(self, documents=None): return self._pos_reader.sents(self._getfileids(documents, 'pos')) def paras(self, documents=None): return self._pos_reader.paras(self._getfileids(documents, 'pos')) def tagged_words(self, documents=None): return self._pos_reader.tagged_words(self._getfileids(documents, 'pos')) def tagged_sents(self, documents=None): return self._pos_reader.tagged_sents(self._getfileids(documents, 'pos')) def tagged_paras(self, documents=None): return self._pos_reader.tagged_paras(self._getfileids(documents, 'pos')) def parsed_sents(self, documents=None): return self._psd_reader.parsed_sents(self._getfileids(documents, 'psd')) class YCOEParseCorpusReader(BracketParseCorpusReader): """Specialized version of the standard bracket parse corpus reader that strips out (CODE ...) and (ID ...) 
nodes.""" def _parse(self, t): t = re.sub(r'(?u)\((CODE|ID)[^\)]*\)', '', t) if re.match(r'\s*\(\s*\)\s*$', t): return None return BracketParseCorpusReader._parse(self, t) class YCOETaggedCorpusReader(TaggedCorpusReader): def __init__(self, root, items, encoding='utf8'): gaps_re = r'(?u)(?<=/\.)\s+|\s*\S*_CODE\s*|\s*\S*_ID\s*' sent_tokenizer = RegexpTokenizer(gaps_re, gaps=True) TaggedCorpusReader.__init__(self, root, items, sep='_', sent_tokenizer=sent_tokenizer) #: A list of all documents and their titles in ycoe. documents = { 'coadrian.o34': 'Adrian and Ritheus', 'coaelhom.o3': 'Ælfric, Supplemental Homilies', 'coaelive.o3': 'Ælfric\'s Lives of Saints', 'coalcuin': 'Alcuin De virtutibus et vitiis', 'coalex.o23': 'Alexander\'s Letter to Aristotle', 'coapollo.o3': 'Apollonius of Tyre', 'coaugust': 'Augustine', 'cobede.o2': 'Bede\'s History of the English Church', 'cobenrul.o3': 'Benedictine Rule', 'coblick.o23': 'Blickling Homilies', 'coboeth.o2': 'Boethius\' Consolation of Philosophy', 'cobyrhtf.o3': 'Byrhtferth\'s Manual', 'cocanedgD': 'Canons of Edgar (D)', 'cocanedgX': 'Canons of Edgar (X)', 'cocathom1.o3': 'Ælfric\'s Catholic Homilies I', 'cocathom2.o3': 'Ælfric\'s Catholic Homilies II', 'cochad.o24': 'Saint Chad', 'cochdrul': 'Chrodegang of Metz, Rule', 'cochristoph': 'Saint Christopher', 'cochronA.o23': 'Anglo-Saxon Chronicle A', 'cochronC': 'Anglo-Saxon Chronicle C', 'cochronD': 'Anglo-Saxon Chronicle D', 'cochronE.o34': 'Anglo-Saxon Chronicle E', 'cocura.o2': 'Cura Pastoralis', 'cocuraC': 'Cura Pastoralis (Cotton)', 'codicts.o34': 'Dicts of Cato', 'codocu1.o1': 'Documents 1 (O1)', 'codocu2.o12': 'Documents 2 (O1/O2)', 'codocu2.o2': 'Documents 2 (O2)', 'codocu3.o23': 'Documents 3 (O2/O3)', 'codocu3.o3': 'Documents 3 (O3)', 'codocu4.o24': 'Documents 4 (O2/O4)', 'coeluc1': 'Honorius of Autun, Elucidarium 1', 'coeluc2': 'Honorius of Autun, Elucidarium 1', 'coepigen.o3': 'Ælfric\'s Epilogue to Genesis', 'coeuphr': 'Saint Euphrosyne', 'coeust': 'Saint Eustace and his companions', 'coexodusP': 'Exodus (P)', 'cogenesiC': 'Genesis (C)', 'cogregdC.o24': 'Gregory\'s Dialogues (C)', 'cogregdH.o23': 'Gregory\'s Dialogues (H)', 'coherbar': 'Pseudo-Apuleius, Herbarium', 'coinspolD.o34': 'Wulfstan\'s Institute of Polity (D)', 'coinspolX': 'Wulfstan\'s Institute of Polity (X)', 'cojames': 'Saint James', 'colacnu.o23': 'Lacnunga', 'colaece.o2': 'Leechdoms', 'colaw1cn.o3': 'Laws, Cnut I', 'colaw2cn.o3': 'Laws, Cnut II', 'colaw5atr.o3': 'Laws, Æthelred V', 'colaw6atr.o3': 'Laws, Æthelred VI', 'colawaf.o2': 'Laws, Alfred', 'colawafint.o2': 'Alfred\'s Introduction to Laws', 'colawger.o34': 'Laws, Gerefa', 'colawine.ox2': 'Laws, Ine', 'colawnorthu.o3': 'Northumbra Preosta Lagu', 'colawwllad.o4': 'Laws, William I, Lad', 'coleofri.o4': 'Leofric', 'colsigef.o3': 'Ælfric\'s Letter to Sigefyrth', 'colsigewB': 'Ælfric\'s Letter to Sigeweard (B)', 'colsigewZ.o34': 'Ælfric\'s Letter to Sigeweard (Z)', 'colwgeat': 'Ælfric\'s Letter to Wulfgeat', 'colwsigeT': 'Ælfric\'s Letter to Wulfsige (T)', 'colwsigeXa.o34': 'Ælfric\'s Letter to Wulfsige (Xa)', 'colwstan1.o3': 'Ælfric\'s Letter to Wulfstan I', 'colwstan2.o3': 'Ælfric\'s Letter to Wulfstan II', 'comargaC.o34': 'Saint Margaret (C)', 'comargaT': 'Saint Margaret (T)', 'comart1': 'Martyrology, I', 'comart2': 'Martyrology, II', 'comart3.o23': 'Martyrology, III', 'comarvel.o23': 'Marvels of the East', 'comary': 'Mary of Egypt', 'coneot': 'Saint Neot', 'conicodA': 'Gospel of Nicodemus (A)', 'conicodC': 'Gospel of Nicodemus (C)', 'conicodD': 'Gospel of Nicodemus 
(D)', 'conicodE': 'Gospel of Nicodemus (E)', 'coorosiu.o2': 'Orosius', 'cootest.o3': 'Heptateuch', 'coprefcath1.o3': 'Ælfric\'s Preface to Catholic Homilies I', 'coprefcath2.o3': 'Ælfric\'s Preface to Catholic Homilies II', 'coprefcura.o2': 'Preface to the Cura Pastoralis', 'coprefgen.o3': 'Ælfric\'s Preface to Genesis', 'copreflives.o3': 'Ælfric\'s Preface to Lives of Saints', 'coprefsolilo': 'Preface to Augustine\'s Soliloquies', 'coquadru.o23': 'Pseudo-Apuleius, Medicina de quadrupedibus', 'corood': 'History of the Holy Rood-Tree', 'cosevensl': 'Seven Sleepers', 'cosolilo': 'St. Augustine\'s Soliloquies', 'cosolsat1.o4': 'Solomon and Saturn I', 'cosolsat2': 'Solomon and Saturn II', 'cotempo.o3': 'Ælfric\'s De Temporibus Anni', 'coverhom': 'Vercelli Homilies', 'coverhomE': 'Vercelli Homilies (E)', 'coverhomL': 'Vercelli Homilies (L)', 'covinceB': 'Saint Vincent (Bodley 343)', 'covinsal': 'Vindicta Salvatoris', 'cowsgosp.o3': 'West-Saxon Gospels', 'cowulf.o34': 'Wulfstan\'s Homilies' } nltk-3.1/nltk/corpus/util.py0000644000076500000240000001130712607224144015672 0ustar sbstaff00000000000000# Natural Language Toolkit: Corpus Reader Utility Functions # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT ###################################################################### #{ Lazy Corpus Loader ###################################################################### from __future__ import unicode_literals import re import gc import nltk from nltk.compat import python_2_unicode_compatible TRY_ZIPFILE_FIRST = False @python_2_unicode_compatible class LazyCorpusLoader(object): """ To see the API documentation for this lazily loaded corpus, first run corpus.ensure_loaded(), and then run help(this_corpus). LazyCorpusLoader is a proxy object which is used to stand in for a corpus object before the corpus is loaded. This allows NLTK to create an object for each corpus, but defer the costs associated with loading those corpora until the first time that they're actually accessed. The first time this object is accessed in any way, it will load the corresponding corpus, and transform itself into that corpus (by modifying its own ``__class__`` and ``__dict__`` attributes). If the corpus can not be found, then accessing this object will raise an exception, displaying installation instructions for the NLTK data package. Once they've properly installed the data package (or modified ``nltk.data.path`` to point to its location), they can then use the corpus object without restarting python. """ def __init__(self, name, reader_cls, *args, **kwargs): from nltk.corpus.reader.api import CorpusReader assert issubclass(reader_cls, CorpusReader) self.__name = self.__name__ = name self.__reader_cls = reader_cls self.__args = args self.__kwargs = kwargs def __load(self): # Find the corpus root directory. zip_name = re.sub(r'(([^/]*)(/.*)?)', r'\2.zip/\1/', self.__name) if TRY_ZIPFILE_FIRST: try: root = nltk.data.find('corpora/%s' % zip_name) except LookupError as e: try: root = nltk.data.find('corpora/%s' % self.__name) except LookupError: raise e else: try: root = nltk.data.find('corpora/%s' % self.__name) except LookupError as e: try: root = nltk.data.find('corpora/%s' % zip_name) except LookupError: raise e # Load the corpus. corpus = self.__reader_cls(root, *self.__args, **self.__kwargs) # This is where the magic happens! Transform ourselves into # the corpus by modifying our own __dict__ and __class__ to # match that of the corpus. 
args, kwargs = self.__args, self.__kwargs name, reader_cls = self.__name, self.__reader_cls self.__dict__ = corpus.__dict__ self.__class__ = corpus.__class__ # _unload support: assign __dict__ and __class__ back, then do GC. # after reassigning __dict__ there shouldn't be any references to # corpus data so the memory should be deallocated after gc.collect() def _unload(self): lazy_reader = LazyCorpusLoader(name, reader_cls, *args, **kwargs) self.__dict__ = lazy_reader.__dict__ self.__class__ = lazy_reader.__class__ gc.collect() self._unload = _make_bound_method(_unload, self) def __getattr__(self, attr): # Fix for inspect.isclass under Python 2.6 # (see http://bugs.python.org/issue1225107). # Without this fix tests may take extra 1.5GB RAM # because all corpora gets loaded during test collection. if attr == '__bases__': raise AttributeError("LazyCorpusLoader object has no attribute '__bases__'") self.__load() # This looks circular, but its not, since __load() changes our # __class__ to something new: return getattr(self, attr) def __repr__(self): return '<%s in %r (not loaded yet)>' % ( self.__reader_cls.__name__, '.../corpora/'+self.__name) def _unload(self): # If an exception occures during corpus loading then # '_unload' method may be unattached, so __getattr__ can be called; # we shouldn't trigger corpus loading again in this case. pass def _make_bound_method(func, self): """ Magic for creating bound methods (used for _unload). """ class Foo(object): def meth(self): pass f = Foo() bound_method = type(f.meth) try: return bound_method(func, self, self.__class__) except TypeError: # python3 return bound_method(func, self) nltk-3.1/nltk/data.py0000644000076500000240000015016312607224144014317 0ustar sbstaff00000000000000# Natural Language Toolkit: Utility functions # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Functions to find and load NLTK resource files, such as corpora, grammars, and saved processing objects. Resource files are identified using URLs, such as ``nltk:corpora/abc/rural.txt`` or ``http://nltk.org/sample/toy.cfg``. The following URL protocols are supported: - ``file:path``: Specifies the file whose path is *path*. Both relative and absolute paths may be used. - ``http://host/path``: Specifies the file stored on the web server *host* at path *path*. - ``nltk:path``: Specifies the file stored in the NLTK data package at *path*. NLTK will search for these files in the directories specified by ``nltk.data.path``. If no protocol is specified, then the default protocol ``nltk:`` will be used. This module provides to functions that can be used to access a resource file, given its URL: ``load()`` loads a given resource, and adds it to a resource cache; and ``retrieve()`` copies a given resource to a local file. 
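For example (an illustrative sketch; it assumes the ``abc`` corpus and the sample grammars have been installed with the NLTK downloader):

    >>> from nltk import data
    >>> raw = data.load('nltk:corpora/abc/rural.txt', format='raw') # doctest: +SKIP
    >>> grammar = data.load('grammars/sample_grammars/toy.cfg') # doctest: +SKIP
    >>> data.retrieve('nltk:corpora/abc/rural.txt', 'rural.txt') # doctest: +SKIP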
""" from __future__ import print_function, unicode_literals from __future__ import division import sys import io import os import textwrap import re import zipfile import codecs from gzip import GzipFile, READ as GZ_READ, WRITE as GZ_WRITE try: from zlib import Z_SYNC_FLUSH as FLUSH except ImportError: from zlib import Z_FINISH as FLUSH try: import cPickle as pickle except ImportError: import pickle # this import should be more specific: import nltk from nltk.compat import py3_data, add_py3_data from nltk.compat import text_type, string_types, BytesIO, urlopen, url2pathname ###################################################################### # Search Path ###################################################################### path = [] """A list of directories where the NLTK data package might reside. These directories will be checked in order when looking for a resource in the data package. Note that this allows users to substitute in their own versions of resources, if they have them (e.g., in their home directory under ~/nltk_data).""" # User-specified locations: _paths_from_env = os.environ.get('NLTK_DATA', str('')).split(os.pathsep) path += [d for d in _paths_from_env if d] if 'APPENGINE_RUNTIME' not in os.environ and os.path.expanduser('~/') != '~/': path.append(os.path.expanduser(str('~/nltk_data'))) if sys.platform.startswith('win'): # Common locations on Windows: path += [ str(r'C:\nltk_data'), str(r'D:\nltk_data'), str(r'E:\nltk_data'), os.path.join(sys.prefix, str('nltk_data')), os.path.join(sys.prefix, str('lib'), str('nltk_data')), os.path.join( os.environ.get(str('APPDATA'), str('C:\\')), str('nltk_data')) ] else: # Common locations on UNIX & OS X: path += [ str('/usr/share/nltk_data'), str('/usr/local/share/nltk_data'), str('/usr/lib/nltk_data'), str('/usr/local/lib/nltk_data') ] ###################################################################### # Util Functions ###################################################################### def gzip_open_unicode(filename, mode="rb", compresslevel=9, encoding='utf-8', fileobj=None, errors=None, newline=None): if fileobj is None: fileobj = GzipFile(filename, mode, compresslevel, fileobj) return io.TextIOWrapper(fileobj, encoding, errors, newline) def split_resource_url(resource_url): """ Splits a resource url into ":". >>> windows = sys.platform.startswith('win') >>> split_resource_url('nltk:home/nltk') ('nltk', 'home/nltk') >>> split_resource_url('nltk:/home/nltk') ('nltk', '/home/nltk') >>> split_resource_url('file:/home/nltk') ('file', '/home/nltk') >>> split_resource_url('file:///home/nltk') ('file', '/home/nltk') >>> split_resource_url('file:///C:/home/nltk') ('file', '/C:/home/nltk') """ protocol, path_ = resource_url.split(':', 1) if protocol == 'nltk': pass elif protocol == 'file': if path_.startswith('/'): path_ = '/' + path_.lstrip('/') else: path_ = re.sub(r'^/{0,2}', '', path_) return protocol, path_ def normalize_resource_url(resource_url): r""" Normalizes a resource url >>> windows = sys.platform.startswith('win') >>> os.path.normpath(split_resource_url(normalize_resource_url('file:grammar.fcfg'))[1]) == \ ... 
('\\' if windows else '') + os.path.abspath(os.path.join(os.curdir, 'grammar.fcfg')) True >>> not windows or normalize_resource_url('file:C:/dir/file') == 'file:///C:/dir/file' True >>> not windows or normalize_resource_url('file:C:\\dir\\file') == 'file:///C:/dir/file' True >>> not windows or normalize_resource_url('file:C:\\dir/file') == 'file:///C:/dir/file' True >>> not windows or normalize_resource_url('file://C:/dir/file') == 'file:///C:/dir/file' True >>> not windows or normalize_resource_url('file:////C:/dir/file') == 'file:///C:/dir/file' True >>> not windows or normalize_resource_url('nltk:C:/dir/file') == 'file:///C:/dir/file' True >>> not windows or normalize_resource_url('nltk:C:\\dir\\file') == 'file:///C:/dir/file' True >>> windows or normalize_resource_url('file:/dir/file/toy.cfg') == 'file:///dir/file/toy.cfg' True >>> normalize_resource_url('nltk:home/nltk') 'nltk:home/nltk' >>> windows or normalize_resource_url('nltk:/home/nltk') == 'file:///home/nltk' True >>> normalize_resource_url('http://example.com/dir/file') 'http://example.com/dir/file' >>> normalize_resource_url('dir/file') 'nltk:dir/file' """ try: protocol, name = split_resource_url(resource_url) except ValueError: # the resource url has no protocol, use the nltk protocol by default protocol = 'nltk' name = resource_url # use file protocol if the path is an absolute path if protocol == 'nltk' and os.path.isabs(name): protocol = 'file://' name = normalize_resource_name(name, False, None) elif protocol == 'file': protocol = 'file://' # name is absolute name = normalize_resource_name(name, False, None) elif protocol == 'nltk': protocol = 'nltk:' name = normalize_resource_name(name, True) else: # handled by urllib protocol += '://' return ''.join([protocol, name]) def normalize_resource_name(resource_name, allow_relative=True, relative_path=None): """ :type resource_name: str or unicode :param resource_name: The name of the resource to search for. Resource names are posix-style relative path names, such as ``corpora/brown``. Directory names will automatically be converted to a platform-appropriate path separator. 
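A simple relative name such as ``corpora/brown`` passes through unchanged on every platform:

    >>> normalize_resource_name('corpora/brown', True)
    'corpora/brown'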
Directory trailing slashes are preserved >>> windows = sys.platform.startswith('win') >>> normalize_resource_name('.', True) './' >>> normalize_resource_name('./', True) './' >>> windows or normalize_resource_name('dir/file', False, '/') == '/dir/file' True >>> not windows or normalize_resource_name('C:/file', False, '/') == '/C:/file' True >>> windows or normalize_resource_name('/dir/file', False, '/') == '/dir/file' True >>> windows or normalize_resource_name('../dir/file', False, '/') == '/dir/file' True >>> not windows or normalize_resource_name('/dir/file', True, '/') == 'dir/file' True >>> windows or normalize_resource_name('/dir/file', True, '/') == '/dir/file' True """ is_dir = bool(re.search(r'[\\/.]$', resource_name)) or resource_name.endswith(os.path.sep) if sys.platform.startswith('win'): resource_name = resource_name.lstrip('/') else: resource_name = re.sub(r'^/+', '/', resource_name) if allow_relative: resource_name = os.path.normpath(resource_name) else: if relative_path is None: relative_path = os.curdir resource_name = os.path.abspath( os.path.join(relative_path, resource_name)) resource_name = resource_name.replace('\\', '/').replace(os.path.sep, '/') if sys.platform.startswith('win') and os.path.isabs(resource_name): resource_name = '/' + resource_name if is_dir and not resource_name.endswith('/'): resource_name += '/' return resource_name ###################################################################### # Path Pointers ###################################################################### class PathPointer(object): """ An abstract base class for 'path pointers,' used by NLTK's data package to identify specific paths. Two subclasses exist: ``FileSystemPathPointer`` identifies a file that can be accessed directly via a given absolute path. ``ZipFilePathPointer`` identifies a file contained within a zipfile, that can be accessed by reading that zipfile. """ def open(self, encoding=None): """ Return a seekable read-only stream that can be used to read the contents of the file identified by this path pointer. :raise IOError: If the path specified by this pointer does not contain a readable file. """ raise NotImplementedError('abstract base class') def file_size(self): """ Return the size of the file pointed to by this path pointer, in bytes. :raise IOError: If the path specified by this pointer does not contain a readable file. """ raise NotImplementedError('abstract base class') def join(self, fileid): """ Return a new path pointer formed by starting at the path identified by this pointer, and then following the relative path given by ``fileid``. The path components of ``fileid`` should be separated by forward slashes, regardless of the underlying file system's path seperator character. """ raise NotImplementedError('abstract base class') class FileSystemPathPointer(PathPointer, text_type): """ A path pointer that identifies a file which can be accessed directly via a given absolute path. """ @py3_data def __init__(self, _path): """ Create a new path pointer for the given absolute path. :raise IOError: If the given path does not exist. """ _path = os.path.abspath(_path) if not os.path.exists(_path): raise IOError('No such file or directory: %r' % _path) self._path = _path # There's no need to call str.__init__(), since it's a no-op; # str does all of its setup work in __new__. 
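    # Illustrative usage (a sketch; the data path below is hypothetical and
    # must exist on the local filesystem for the constructor to succeed):
    #
    #     ptr = FileSystemPathPointer('/usr/share/nltk_data/corpora/abc')
    #     stream = ptr.join('rural.txt').open(encoding='utf8')
    #     first_line = stream.readline()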
@property def path(self): """The absolute path identified by this path pointer.""" return self._path def open(self, encoding=None): stream = open(self._path, 'rb') if encoding is not None: stream = SeekableUnicodeStreamReader(stream, encoding) return stream def file_size(self): return os.stat(self._path).st_size def join(self, fileid): _path = os.path.join(self._path, fileid) return FileSystemPathPointer(_path) def __repr__(self): # This should be a byte string under Python 2.x; # we don't want transliteration here so # @python_2_unicode_compatible is not used. return str('FileSystemPathPointer(%r)' % self._path) def __str__(self): return self._path class BufferedGzipFile(GzipFile): """ A ``GzipFile`` subclass that buffers calls to ``read()`` and ``write()``. This allows faster reads and writes of data to and from gzip-compressed files at the cost of using more memory. The default buffer size is 2MB. ``BufferedGzipFile`` is useful for loading large gzipped pickle objects as well as writing large encoded feature files for classifier training. """ MB = 2 ** 20 SIZE = 2 * MB @py3_data def __init__(self, filename=None, mode=None, compresslevel=9, fileobj=None, **kwargs): """ Return a buffered gzip file object. :param filename: a filesystem path :type filename: str :param mode: a file mode which can be any of 'r', 'rb', 'a', 'ab', 'w', or 'wb' :type mode: str :param compresslevel: The compresslevel argument is an integer from 1 to 9 controlling the level of compression; 1 is fastest and produces the least compression, and 9 is slowest and produces the most compression. The default is 9. :type compresslevel: int :param fileobj: a BytesIO stream to read from instead of a file. :type fileobj: BytesIO :param size: number of bytes to buffer during calls to read() and write() :type size: int :rtype: BufferedGzipFile """ GzipFile.__init__(self, filename, mode, compresslevel, fileobj) self._size = kwargs.get('size', self.SIZE) self._buffer = BytesIO() # cStringIO does not support len. self._len = 0 def _reset_buffer(self): # For some reason calling BytesIO.truncate() here will lead to # inconsistent writes so just set _buffer to a new BytesIO object. self._buffer = BytesIO() self._len = 0 def _write_buffer(self, data): # Simply write to the buffer and increment the buffer size. if data is not None: self._buffer.write(data) self._len += len(data) def _write_gzip(self, data): # Write the current buffer to the GzipFile. GzipFile.write(self, self._buffer.getvalue()) # Then reset the buffer and write the new data to the buffer. self._reset_buffer() self._write_buffer(data) def close(self): # GzipFile.close() doesn't actuallly close anything. 
if self.mode == GZ_WRITE: self._write_gzip(None) self._reset_buffer() return GzipFile.close(self) def flush(self, lib_mode=FLUSH): self._buffer.flush() GzipFile.flush(self, lib_mode) def read(self, size=None): if not size: size = self._size contents = BytesIO() while True: blocks = GzipFile.read(self, size) if not blocks: contents.flush() break contents.write(blocks) return contents.getvalue() else: return GzipFile.read(self, size) def write(self, data, size=-1): """ :param data: bytes to write to file or buffer :type data: bytes :param size: buffer at least size bytes before writing to file :type size: int """ if not size: size = self._size if self._len + len(data) <= size: self._write_buffer(data) else: self._write_gzip(data) class GzipFileSystemPathPointer(FileSystemPathPointer): """ A subclass of ``FileSystemPathPointer`` that identifies a gzip-compressed file located at a given absolute path. ``GzipFileSystemPathPointer`` is appropriate for loading large gzip-compressed pickle objects efficiently. """ def open(self, encoding=None): stream = BufferedGzipFile(self._path, 'rb') if encoding: stream = SeekableUnicodeStreamReader(stream, encoding) return stream class ZipFilePathPointer(PathPointer): """ A path pointer that identifies a file contained within a zipfile, which can be accessed by reading that zipfile. """ @py3_data def __init__(self, zipfile, entry=''): """ Create a new path pointer pointing at the specified entry in the given zipfile. :raise IOError: If the given zipfile does not exist, or if it does not contain the specified entry. """ if isinstance(zipfile, string_types): zipfile = OpenOnDemandZipFile(os.path.abspath(zipfile)) # Normalize the entry string, it should be relative: entry = normalize_resource_name(entry, True, '/').lstrip('/') # Check that the entry exists: if entry: try: zipfile.getinfo(entry) except Exception: # Sometimes directories aren't explicitly listed in # the zip file. So if `entry` is a directory name, # then check if the zipfile contains any files that # are under the given directory. if (entry.endswith('/') and [n for n in zipfile.namelist() if n.startswith(entry)]): pass # zipfile contains a file in that directory. else: # Otherwise, complain. raise IOError('Zipfile %r does not contain %r' % (zipfile.filename, entry)) self._zipfile = zipfile self._entry = entry @property def zipfile(self): """ The zipfile.ZipFile object used to access the zip file containing the entry identified by this path pointer. """ return self._zipfile @property def entry(self): """ The name of the file within zipfile that this path pointer points to. 
""" return self._entry def open(self, encoding=None): data = self._zipfile.read(self._entry) stream = BytesIO(data) if self._entry.endswith('.gz'): stream = BufferedGzipFile(self._entry, fileobj=stream) elif encoding is not None: stream = SeekableUnicodeStreamReader(stream, encoding) return stream def file_size(self): return self._zipfile.getinfo(self._entry).file_size def join(self, fileid): entry = '%s/%s' % (self._entry, fileid) return ZipFilePathPointer(self._zipfile, entry) def __repr__(self): return str('ZipFilePathPointer(%r, %r)') % ( self._zipfile.filename, self._entry) def __str__(self): return os.path.normpath(os.path.join(self._zipfile.filename, self._entry)) ###################################################################### # Access Functions ###################################################################### # Don't use a weak dictionary, because in the common case this # causes a lot more reloading that necessary. _resource_cache = {} """A dictionary used to cache resources so that they won't need to be loaded more than once.""" def find(resource_name, paths=None): """ Find the given resource by searching through the directories and zip files in paths, where a None or empty string specifies an absolute path. Returns a corresponding path name. If the given resource is not found, raise a ``LookupError``, whose message gives a pointer to the installation instructions for the NLTK downloader. Zip File Handling: - If ``resource_name`` contains a component with a ``.zip`` extension, then it is assumed to be a zipfile; and the remaining path components are used to look inside the zipfile. - If any element of ``nltk.data.path`` has a ``.zip`` extension, then it is assumed to be a zipfile. - If a given resource name that does not contain any zipfile component is not found initially, then ``find()`` will make a second attempt to find that resource, by replacing each component *p* in the path with *p.zip/p*. For example, this allows ``find()`` to map the resource name ``corpora/chat80/cities.pl`` to a zip file path pointer to ``corpora/chat80.zip/chat80/cities.pl``. - When using ``find()`` to locate a directory contained in a zipfile, the resource name must end with the forward slash character. Otherwise, ``find()`` will not locate the directory. :type resource_name: str or unicode :param resource_name: The name of the resource to search for. Resource names are posix-style relative path names, such as ``corpora/brown``. Directory names will be automatically converted to a platform-appropriate path separator. :rtype: str """ resource_name = normalize_resource_name(resource_name, True) # Resolve default paths at runtime in-case the user overrides # nltk.data.path if paths is None: paths = path # Check if the resource name includes a zipfile name m = re.match(r'(.*\.zip)/?(.*)$|', resource_name) zipfile, zipentry = m.groups() # Check each item in our path for path_ in paths: # Is the path item a zipfile? if path_ and (os.path.isfile(path_) and path_.endswith('.zip')): try: return ZipFilePathPointer(path_, resource_name) except IOError: # resource not in zipfile continue # Is the path item a directory or is resource_name an absolute path? 
elif not path_ or os.path.isdir(path_): if zipfile is None: p = os.path.join(path_, url2pathname(resource_name)) if os.path.exists(p): if p.endswith('.gz'): return GzipFileSystemPathPointer(p) else: return FileSystemPathPointer(p) else: p = os.path.join(path_, url2pathname(zipfile)) if os.path.exists(p): try: return ZipFilePathPointer(p, zipentry) except IOError: # resource not in zipfile continue # Fallback: if the path doesn't include a zip file, then try # again, assuming that one of the path components is inside a # zipfile of the same name. if zipfile is None: pieces = resource_name.split('/') for i in range(len(pieces)): modified_name = '/'.join(pieces[:i] + [pieces[i] + '.zip'] + pieces[i:]) try: return find(modified_name, paths) except LookupError: pass # Display a friendly error message if the resource wasn't found: msg = textwrap.fill( 'Resource %r not found. Please use the NLTK Downloader to ' 'obtain the resource: >>> nltk.download()' % (resource_name,), initial_indent=' ', subsequent_indent=' ', width=66) msg += '\n Searched in:' + ''.join('\n - %r' % d for d in paths) sep = '*' * 70 resource_not_found = '\n%s\n%s\n%s' % (sep, msg, sep) raise LookupError(resource_not_found) def retrieve(resource_url, filename=None, verbose=True): """ Copy the given resource to a local file. If no filename is specified, then use the URL's filename. If there is already a file named ``filename``, then raise a ``ValueError``. :type resource_url: str :param resource_url: A URL specifying where the resource should be loaded from. The default protocol is "nltk:", which searches for the file in the the NLTK data package. """ resource_url = normalize_resource_url(resource_url) if filename is None: if resource_url.startswith('file:'): filename = os.path.split(resource_url)[-1] else: filename = re.sub(r'(^\w+:)?.*/', '', resource_url) if os.path.exists(filename): filename = os.path.abspath(filename) raise ValueError("File %r already exists!" % filename) if verbose: print('Retrieving %r, saving to %r' % (resource_url, filename)) # Open the input & output streams. infile = _open(resource_url) # Copy infile -> outfile, using 64k blocks. with open(filename, "wb") as outfile: while True: s = infile.read(1024 * 64) # 64k blocks. outfile.write(s) if not s: break infile.close() #: A dictionary describing the formats that are supported by NLTK's #: load() method. Keys are format names, and values are format #: descriptions. FORMATS = { 'pickle': "A serialized python object, stored using the pickle module.", 'json': "A serialized python object, stored using the json module.", 'yaml': "A serialized python object, stored using the yaml module.", 'cfg': "A context free grammar.", 'pcfg': "A probabilistic CFG.", 'fcfg': "A feature CFG.", 'fol': "A list of first order logic expressions, parsed with " "nltk.sem.logic.Expression.fromstring.", 'logic': "A list of first order logic expressions, parsed with " "nltk.sem.logic.LogicParser. Requires an additional logic_parser " "parameter", 'val': "A semantic valuation, parsed by nltk.sem.Valuation.fromstring.", 'raw': "The raw (byte string) contents of a file.", 'text': "The raw (unicode string) contents of a file. " } #: A dictionary mapping from file extensions to format names, used #: by load() when format="auto" to decide the format for a #: given resource url. 
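#: For example, a url ending in ``.cfg`` is loaded as a context free #: grammar, and a trailing ``.gz`` extension is stripped first, so a #: url ending in ``.pickle.gz`` is treated as ``pickle``.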
AUTO_FORMATS = { 'pickle': 'pickle', 'json': 'json', 'yaml': 'yaml', 'cfg': 'cfg', 'pcfg': 'pcfg', 'fcfg': 'fcfg', 'fol': 'fol', 'logic': 'logic', 'val': 'val', 'txt': 'text', 'text': 'text', } def load(resource_url, format='auto', cache=True, verbose=False, logic_parser=None, fstruct_reader=None, encoding=None): """ Load a given resource from the NLTK data package. The following resource formats are currently supported: - ``pickle`` - ``json`` - ``yaml`` - ``cfg`` (context free grammars) - ``pcfg`` (probabilistic CFGs) - ``fcfg`` (feature-based CFGs) - ``fol`` (formulas of First Order Logic) - ``logic`` (Logical formulas to be parsed by the given logic_parser) - ``val`` (valuation of First Order Logic model) - ``text`` (the file contents as a unicode string) - ``raw`` (the raw file contents as a byte string) If no format is specified, ``load()`` will attempt to determine a format based on the resource name's file extension. If that fails, ``load()`` will raise a ``ValueError`` exception. For all text formats (everything except ``pickle``, ``json``, ``yaml`` and ``raw``), it tries to decode the raw contents using UTF-8, and if that doesn't work, it tries with ISO-8859-1 (Latin-1), unless the ``encoding`` is specified. :type resource_url: str :param resource_url: A URL specifying where the resource should be loaded from. The default protocol is "nltk:", which searches for the file in the the NLTK data package. :type cache: bool :param cache: If true, add this resource to a cache. If load() finds a resource in its cache, then it will return it from the cache rather than loading it. The cache uses weak references, so a resource wil automatically be expunged from the cache when no more objects are using it. :type verbose: bool :param verbose: If true, print a message when loading a resource. Messages are not displayed when a resource is retrieved from the cache. :type logic_parser: LogicParser :param logic_parser: The parser that will be used to parse logical expressions. :type fstruct_reader: FeatStructReader :param fstruct_reader: The parser that will be used to parse the feature structure of an fcfg. :type encoding: str :param encoding: the encoding of the input; only used for text formats. """ resource_url = normalize_resource_url(resource_url) resource_url = add_py3_data(resource_url) # Determine the format of the resource. if format == 'auto': resource_url_parts = resource_url.split('.') ext = resource_url_parts[-1] if ext == 'gz': ext = resource_url_parts[-2] format = AUTO_FORMATS.get(ext) if format is None: raise ValueError('Could not determine format for %s based ' 'on its file\nextension; use the "format" ' 'argument to specify the format explicitly.' % resource_url) if format not in FORMATS: raise ValueError('Unknown format type: %s!' % (format,)) # If we've cached the resource, then just return it. if cache: resource_val = _resource_cache.get((resource_url, format)) if resource_val is not None: if verbose: print('<>' % (resource_url,)) return resource_val # Let the user know what's going on. if verbose: print('<>' % (resource_url,)) # Load the resource. 
opened_resource = _open(resource_url) if format == 'raw': resource_val = opened_resource.read() elif format == 'pickle': resource_val = pickle.load(opened_resource) elif format == 'json': import json from nltk.jsontags import json_tags resource_val = json.load(opened_resource) tag = None if len(resource_val) != 1: tag = next(resource_val.keys()) if tag not in json_tags: raise ValueError('Unknown json tag.') elif format == 'yaml': import yaml resource_val = yaml.load(opened_resource) else: # The resource is a text format. binary_data = opened_resource.read() if encoding is not None: string_data = binary_data.decode(encoding) else: try: string_data = binary_data.decode('utf-8') except UnicodeDecodeError: string_data = binary_data.decode('latin-1') if format == 'text': resource_val = string_data elif format == 'cfg': resource_val = nltk.grammar.CFG.fromstring( string_data, encoding=encoding) elif format == 'pcfg': resource_val = nltk.grammar.PCFG.fromstring( string_data, encoding=encoding) elif format == 'fcfg': resource_val = nltk.grammar.FeatureGrammar.fromstring( string_data, logic_parser=logic_parser, fstruct_reader=fstruct_reader, encoding=encoding) elif format == 'fol': resource_val = nltk.sem.read_logic( string_data, logic_parser=nltk.sem.logic.LogicParser(), encoding=encoding) elif format == 'logic': resource_val = nltk.sem.read_logic( string_data, logic_parser=logic_parser, encoding=encoding) elif format == 'val': resource_val = nltk.sem.read_valuation( string_data, encoding=encoding) else: raise AssertionError("Internal NLTK error: Format %s isn't " "handled by nltk.data.load()" % (format,)) opened_resource.close() # If requested, add it to the cache. if cache: try: _resource_cache[(resource_url, format)] = resource_val # TODO: add this line # print('<>' % (resource_url,)) except TypeError: # We can't create weak references to some object types, like # strings and tuples. For now, just don't cache them. pass return resource_val def show_cfg(resource_url, escape='##'): """ Write out a grammar file, ignoring escaped and empty lines. :type resource_url: str :param resource_url: A URL specifying where the resource should be loaded from. The default protocol is "nltk:", which searches for the file in the the NLTK data package. :type escape: str :param escape: Prepended string that signals lines to be ignored """ resource_url = normalize_resource_url(resource_url) resource_val = load(resource_url, format='text', cache=False) lines = resource_val.splitlines() for l in lines: if l.startswith(escape): continue if re.match('^$', l): continue print(l) def clear_cache(): """ Remove all objects from the resource cache. :see: load() """ _resource_cache.clear() def _open(resource_url): """ Helper function that returns an open file object for a resource, given its resource URL. If the given resource URL uses the "nltk:" protocol, or uses no protocol, then use ``nltk.data.find`` to find its path, and open it with the given mode; if the resource URL uses the 'file' protocol, then open the file with the given mode; otherwise, delegate to ``urllib2.urlopen``. :type resource_url: str :param resource_url: A URL specifying where the resource should be loaded from. The default protocol is "nltk:", which searches for the file in the the NLTK data package. 
""" resource_url = normalize_resource_url(resource_url) protocol, path_ = split_resource_url(resource_url) if protocol is None or protocol.lower() == 'nltk': return find(path_, path + ['']).open() elif protocol.lower() == 'file': # urllib might not use mode='rb', so handle this one ourselves: return find(path_, ['']).open() else: return urlopen(resource_url) ###################################################################### # Lazy Resource Loader ###################################################################### # We shouldn't apply @python_2_unicode_compatible # decorator to LazyLoader, this is resource.__class__ responsibility. class LazyLoader(object): @py3_data def __init__(self, _path): self._path = _path def __load(self): resource = load(self._path) # This is where the magic happens! Transform ourselves into # the object by modifying our own __dict__ and __class__ to # match that of `resource`. self.__dict__ = resource.__dict__ self.__class__ = resource.__class__ def __getattr__(self, attr): self.__load() # This looks circular, but its not, since __load() changes our # __class__ to something new: return getattr(self, attr) def __repr__(self): self.__load() # This looks circular, but its not, since __load() changes our # __class__ to something new: return repr(self) ###################################################################### # Open-On-Demand ZipFile ###################################################################### class OpenOnDemandZipFile(zipfile.ZipFile): """ A subclass of ``zipfile.ZipFile`` that closes its file pointer whenever it is not using it; and re-opens it when it needs to read data from the zipfile. This is useful for reducing the number of open file handles when many zip files are being accessed at once. ``OpenOnDemandZipFile`` must be constructed from a filename, not a file-like object (to allow re-opening). ``OpenOnDemandZipFile`` is read-only (i.e. ``write()`` and ``writestr()`` are disabled. """ @py3_data def __init__(self, filename): if not isinstance(filename, string_types): raise TypeError('ReopenableZipFile filename must be a string') zipfile.ZipFile.__init__(self, filename) assert self.filename == filename self.close() def read(self, name): assert self.fp is None self.fp = open(self.filename, 'rb') value = zipfile.ZipFile.read(self, name) self.close() return value def write(self, *args, **kwargs): """:raise NotImplementedError: OpenOnDemandZipfile is read-only""" raise NotImplementedError('OpenOnDemandZipfile is read-only') def writestr(self, *args, **kwargs): """:raise NotImplementedError: OpenOnDemandZipfile is read-only""" raise NotImplementedError('OpenOnDemandZipfile is read-only') def __repr__(self): return repr(str('OpenOnDemandZipFile(%r)') % self.filename) ###################################################################### #{ Seekable Unicode Stream Reader ###################################################################### class SeekableUnicodeStreamReader(object): """ A stream reader that automatically encodes the source byte stream into unicode (like ``codecs.StreamReader``); but still supports the ``seek()`` and ``tell()`` operations correctly. This is in contrast to ``codecs.StreamReader``, which provide *broken* ``seek()`` and ``tell()`` methods. This class was motivated by ``StreamBackedCorpusView``, which makes extensive use of ``seek()`` and ``tell()``, and needs to be able to handle unicode-encoded files. Note: this class requires stateless decoders. 
To my knowledge, this shouldn't cause a problem with any of python's builtin unicode encodings. """ DEBUG = True # : If true, then perform extra sanity checks. @py3_data def __init__(self, stream, encoding, errors='strict'): # Rewind the stream to its beginning. stream.seek(0) self.stream = stream """The underlying stream.""" self.encoding = encoding """The name of the encoding that should be used to encode the underlying stream.""" self.errors = errors """The error mode that should be used when decoding data from the underlying stream. Can be 'strict', 'ignore', or 'replace'.""" self.decode = codecs.getdecoder(encoding) """The function that is used to decode byte strings into unicode strings.""" self.bytebuffer = b'' """A buffer to use bytes that have been read but have not yet been decoded. This is only used when the final bytes from a read do not form a complete encoding for a character.""" self.linebuffer = None """A buffer used by ``readline()`` to hold characters that have been read, but have not yet been returned by ``read()`` or ``readline()``. This buffer consists of a list of unicode strings, where each string corresponds to a single line. The final element of the list may or may not be a complete line. Note that the existence of a linebuffer makes the ``tell()`` operation more complex, because it must backtrack to the beginning of the buffer to determine the correct file position in the underlying byte stream.""" self._rewind_checkpoint = 0 """The file position at which the most recent read on the underlying stream began. This is used, together with ``_rewind_numchars``, to backtrack to the beginning of ``linebuffer`` (which is required by ``tell()``).""" self._rewind_numchars = None """The number of characters that have been returned since the read that started at ``_rewind_checkpoint``. This is used, together with ``_rewind_checkpoint``, to backtrack to the beginning of ``linebuffer`` (which is required by ``tell()``).""" self._bom = self._check_bom() """The length of the byte order marker at the beginning of the stream (or None for no byte order marker).""" #///////////////////////////////////////////////////////////////// # Read methods #///////////////////////////////////////////////////////////////// def read(self, size=None): """ Read up to ``size`` bytes, decode them using this reader's encoding, and return the resulting unicode string. :param size: The maximum number of bytes to read. If not specified, then read as many bytes as possible. :type size: int :rtype: unicode """ chars = self._read(size) # If linebuffer is not empty, then include it in the result if self.linebuffer: chars = ''.join(self.linebuffer) + chars self.linebuffer = None self._rewind_numchars = None return chars def readline(self, size=None): """ Read a line of text, decode it using this reader's encoding, and return the resulting unicode string. :param size: The maximum number of bytes to read. If no newline is encountered before ``size`` bytes have been read, then the returned value may not be a complete line of text. :type size: int """ # If we have a non-empty linebuffer, then return the first # line from it. (Note that the last element of linebuffer may # not be a complete line; so let _read() deal with it.) if self.linebuffer and len(self.linebuffer) > 1: line = self.linebuffer.pop(0) self._rewind_numchars += len(line) return line readsize = size or 72 chars = '' # If there's a remaining incomplete line in the buffer, add it. 
if self.linebuffer: chars += self.linebuffer.pop() self.linebuffer = None while True: startpos = self.stream.tell() - len(self.bytebuffer) new_chars = self._read(readsize) # If we're at a '\r', then read one extra character, since # it might be a '\n', to get the proper line ending. if new_chars and new_chars.endswith('\r'): new_chars += self._read(1) chars += new_chars lines = chars.splitlines(True) if len(lines) > 1: line = lines[0] self.linebuffer = lines[1:] self._rewind_numchars = (len(new_chars) - (len(chars) - len(line))) self._rewind_checkpoint = startpos break elif len(lines) == 1: line0withend = lines[0] line0withoutend = lines[0].splitlines(False)[0] if line0withend != line0withoutend: # complete line line = line0withend break if not new_chars or size is not None: line = chars break # Read successively larger blocks of text. if readsize < 8000: readsize *= 2 return line def readlines(self, sizehint=None, keepends=True): """ Read this file's contents, decode them using this reader's encoding, and return it as a list of unicode lines. :rtype: list(unicode) :param sizehint: Ignored. :param keepends: If false, then strip newlines. """ return self.read().splitlines(keepends) def next(self): """Return the next decoded line from the underlying stream.""" line = self.readline() if line: return line else: raise StopIteration def __next__(self): return self.next() def __iter__(self): """Return self""" return self def xreadlines(self): """Return self""" return self #///////////////////////////////////////////////////////////////// # Pass-through methods & properties #///////////////////////////////////////////////////////////////// @property def closed(self): """True if the underlying stream is closed.""" return self.stream.closed @property def name(self): """The name of the underlying stream.""" return self.stream.name @property def mode(self): """The mode of the underlying stream.""" return self.stream.mode def close(self): """ Close the underlying stream. """ self.stream.close() #///////////////////////////////////////////////////////////////// # Seek and tell #///////////////////////////////////////////////////////////////// def seek(self, offset, whence=0): """ Move the stream to a new file position. If the reader is maintaining any buffers, then they will be cleared. :param offset: A byte count offset. :param whence: If 0, then the offset is from the start of the file (offset should be positive), if 1, then the offset is from the current position (offset may be positive or negative); and if 2, then the offset is from the end of the file (offset should typically be negative). """ if whence == 1: raise ValueError('Relative seek is not supported for ' 'SeekableUnicodeStreamReader -- consider ' 'using char_seek_forward() instead.') self.stream.seek(offset, whence) self.linebuffer = None self.bytebuffer = b'' self._rewind_numchars = None self._rewind_checkpoint = self.stream.tell() def char_seek_forward(self, offset): """ Move the read pointer forward by ``offset`` characters. """ if offset < 0: raise ValueError('Negative offsets are not supported') # Clear all buffers. self.seek(self.tell()) # Perform the seek operation. self._char_seek_forward(offset) def _char_seek_forward(self, offset, est_bytes=None): """ Move the file position forward by ``offset`` characters, ignoring all buffers. :param est_bytes: A hint, giving an estimate of the number of bytes that will be neded to move forward by ``offset`` chars. Defaults to ``offset``. 
""" if est_bytes is None: est_bytes = offset bytes = b'' while True: # Read in a block of bytes. newbytes = self.stream.read(est_bytes - len(bytes)) bytes += newbytes # Decode the bytes to characters. chars, bytes_decoded = self._incr_decode(bytes) # If we got the right number of characters, then seek # backwards over any truncated characters, and return. if len(chars) == offset: self.stream.seek(-len(bytes) + bytes_decoded, 1) return # If we went too far, then we can back-up until we get it # right, using the bytes we've already read. if len(chars) > offset: while len(chars) > offset: # Assume at least one byte/char. est_bytes += offset - len(chars) chars, bytes_decoded = self._incr_decode(bytes[:est_bytes]) self.stream.seek(-len(bytes) + bytes_decoded, 1) return # Otherwise, we haven't read enough bytes yet; loop again. est_bytes += offset - len(chars) def tell(self): """ Return the current file position on the underlying byte stream. If this reader is maintaining any buffers, then the returned file position will be the position of the beginning of those buffers. """ # If nothing's buffered, then just return our current filepos: if self.linebuffer is None: return self.stream.tell() - len(self.bytebuffer) # Otherwise, we'll need to backtrack the filepos until we # reach the beginning of the buffer. # Store our original file position, so we can return here. orig_filepos = self.stream.tell() # Calculate an estimate of where we think the newline is. bytes_read = ((orig_filepos - len(self.bytebuffer)) - self._rewind_checkpoint) buf_size = sum(len(line) for line in self.linebuffer) est_bytes = int((bytes_read * self._rewind_numchars / (self._rewind_numchars + buf_size))) self.stream.seek(self._rewind_checkpoint) self._char_seek_forward(self._rewind_numchars, est_bytes) filepos = self.stream.tell() # Sanity check if self.DEBUG: self.stream.seek(filepos) check1 = self._incr_decode(self.stream.read(50))[0] check2 = ''.join(self.linebuffer) assert check1.startswith(check2) or check2.startswith(check1) # Return to our original filepos (so we don't have to throw # out our buffer.) self.stream.seek(orig_filepos) # Return the calculated filepos return filepos #///////////////////////////////////////////////////////////////// # Helper methods #///////////////////////////////////////////////////////////////// def _read(self, size=None): """ Read up to ``size`` bytes from the underlying stream, decode them using this reader's encoding, and return the resulting unicode string. ``linebuffer`` is not included in the result. """ if size == 0: return '' # Skip past the byte order marker, if present. if self._bom and self.stream.tell() == 0: self.stream.read(self._bom) # Read the requested number of bytes. if size is None: new_bytes = self.stream.read() else: new_bytes = self.stream.read(size) bytes = self.bytebuffer + new_bytes # Decode the bytes into unicode characters chars, bytes_decoded = self._incr_decode(bytes) # If we got bytes but couldn't decode any, then read further. if (size is not None) and (not chars) and (len(new_bytes) > 0): while not chars: new_bytes = self.stream.read(1) if not new_bytes: break # end of file. bytes += new_bytes chars, bytes_decoded = self._incr_decode(bytes) # Record any bytes we didn't consume. self.bytebuffer = bytes[bytes_decoded:] # Return the result return chars def _incr_decode(self, bytes): """ Decode the given byte string into a unicode string, using this reader's encoding. 
If an exception is encountered that appears to be caused by a truncation error, then just decode the byte string without the bytes that cause the trunctaion error. Return a tuple ``(chars, num_consumed)``, where ``chars`` is the decoded unicode string, and ``num_consumed`` is the number of bytes that were consumed. """ while True: try: return self.decode(bytes, 'strict') except UnicodeDecodeError as exc: # If the exception occurs at the end of the string, # then assume that it's a truncation error. if exc.end == len(bytes): return self.decode(bytes[:exc.start], self.errors) # Otherwise, if we're being strict, then raise it. elif self.errors == 'strict': raise # If we're not strict, then re-process it with our # errors setting. This *may* raise an exception. else: return self.decode(bytes, self.errors) _BOM_TABLE = { 'utf8': [(codecs.BOM_UTF8, None)], 'utf16': [(codecs.BOM_UTF16_LE, 'utf16-le'), (codecs.BOM_UTF16_BE, 'utf16-be')], 'utf16le': [(codecs.BOM_UTF16_LE, None)], 'utf16be': [(codecs.BOM_UTF16_BE, None)], 'utf32': [(codecs.BOM_UTF32_LE, 'utf32-le'), (codecs.BOM_UTF32_BE, 'utf32-be')], 'utf32le': [(codecs.BOM_UTF32_LE, None)], 'utf32be': [(codecs.BOM_UTF32_BE, None)], } def _check_bom(self): # Normalize our encoding name enc = re.sub('[ -]', '', self.encoding.lower()) # Look up our encoding in the BOM table. bom_info = self._BOM_TABLE.get(enc) if bom_info: # Read a prefix, to check against the BOM(s) bytes = self.stream.read(16) self.stream.seek(0) # Check for each possible BOM. for (bom, new_encoding) in bom_info: if bytes.startswith(bom): if new_encoding: self.encoding = new_encoding return len(bom) return None __all__ = ['path', 'PathPointer', 'FileSystemPathPointer', 'BufferedGzipFile', 'GzipFileSystemPathPointer', 'GzipFileSystemPathPointer', 'find', 'retrieve', 'FORMATS', 'AUTO_FORMATS', 'load', 'show_cfg', 'clear_cache', 'LazyLoader', 'OpenOnDemandZipFile', 'GzipFileSystemPathPointer', 'SeekableUnicodeStreamReader'] nltk-3.1/nltk/decorators.py0000644000076500000240000001670712607224144015560 0ustar sbstaff00000000000000""" Decorator module by Michele Simionato Copyright Michele Simionato, distributed under the terms of the BSD License (see below). http://www.phyast.pitt.edu/~micheles/python/documentation.html Included in NLTK for its support of a nice memoization decorator. """ from __future__ import print_function __docformat__ = 'restructuredtext en' ## The basic trick is to generate the source code for the decorated function ## with the right signature and to evaluate it. ## Uncomment the statement 'print >> sys.stderr, func_src' in _decorator ## to understand what is going on. __all__ = ["decorator", "new_wrapper", "getinfo"] import sys # Hack to keep NLTK's "tokenize" module from colliding with the "tokenize" in # the Python standard library. 
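# (The original sys.path is saved, stripped of NLTK entries for the import of # the standard-library ``inspect`` module, and then restored immediately below.)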
old_sys_path = sys.path[:] sys.path = [p for p in sys.path if "nltk" not in p] import inspect sys.path = old_sys_path try: set except NameError: from sets import Set as set def getinfo(func): """ Returns an info dictionary containing: - name (the name of the function : str) - argnames (the names of the arguments : list) - defaults (the values of the default arguments : tuple) - signature (the signature : str) - doc (the docstring : str) - module (the module name : str) - dict (the function __dict__ : str) >>> def f(self, x=1, y=2, *args, **kw): pass >>> info = getinfo(f) >>> info["name"] 'f' >>> info["argnames"] ['self', 'x', 'y', 'args', 'kw'] >>> info["defaults"] (1, 2) >>> info["signature"] 'self, x, y, *args, **kw' """ assert inspect.ismethod(func) or inspect.isfunction(func) regargs, varargs, varkwargs, defaults = inspect.getargspec(func) argnames = list(regargs) if varargs: argnames.append(varargs) if varkwargs: argnames.append(varkwargs) signature = inspect.formatargspec(regargs, varargs, varkwargs, defaults, formatvalue=lambda value: "")[1:-1] # pypy compatibility if hasattr(func, '__closure__'): _closure = func.__closure__ _globals = func.__globals__ else: _closure = func.func_closure _globals = func.func_globals return dict(name=func.__name__, argnames=argnames, signature=signature, defaults = func.__defaults__, doc=func.__doc__, module=func.__module__, dict=func.__dict__, globals=_globals, closure=_closure) # akin to functools.update_wrapper def update_wrapper(wrapper, model, infodict=None): infodict = infodict or getinfo(model) wrapper.__name__ = infodict['name'] wrapper.__doc__ = infodict['doc'] wrapper.__module__ = infodict['module'] wrapper.__dict__.update(infodict['dict']) wrapper.__defaults__ = infodict['defaults'] wrapper.undecorated = model return wrapper def new_wrapper(wrapper, model): """ An improvement over functools.update_wrapper. The wrapper is a generic callable object. It works by generating a copy of the wrapper with the right signature and by updating the copy, not the original. Moreovoer, 'model' can be a dictionary with keys 'name', 'doc', 'module', 'dict', 'defaults'. """ if isinstance(model, dict): infodict = model else: # assume model is a function infodict = getinfo(model) assert not '_wrapper_' in infodict["argnames"], ( '"_wrapper_" is a reserved argument name!') src = "lambda %(signature)s: _wrapper_(%(signature)s)" % infodict funcopy = eval(src, dict(_wrapper_=wrapper)) return update_wrapper(funcopy, model, infodict) # helper used in decorator_factory def __call__(self, func): return new_wrapper(lambda *a, **k : self.call(func, *a, **k), func) def decorator_factory(cls): """ Take a class with a ``.caller`` method and return a callable decorator object. It works by adding a suitable __call__ method to the class; it raises a TypeError if the class already has a nontrivial __call__ method. """ attrs = set(dir(cls)) if '__call__' in attrs: raise TypeError('You cannot decorate a class with a nontrivial ' '__call__ method') if 'call' not in attrs: raise TypeError('You cannot decorate a class without a ' '.call method') cls.__call__ = __call__ return cls def decorator(caller): """ General purpose decorator factory: takes a caller function as input and returns a decorator with the same attributes. A caller function is any function like this:: def caller(func, *args, **kw): # do something return func(*args, **kw) Here is an example of usage: >>> @decorator ... def chatty(f, *args, **kw): ... print("Calling %r" % f.__name__) ... 
return f(*args, **kw) >>> chatty.__name__ 'chatty' >>> @chatty ... def f(): pass ... >>> f() Calling 'f' decorator can also take in input a class with a .caller method; in this case it converts the class into a factory of callable decorator objects. See the documentation for an example. """ if inspect.isclass(caller): return decorator_factory(caller) def _decorator(func): # the real meat is here infodict = getinfo(func) argnames = infodict['argnames'] assert not ('_call_' in argnames or '_func_' in argnames), ( 'You cannot use _call_ or _func_ as argument names!') src = "lambda %(signature)s: _call_(_func_, %(signature)s)" % infodict # import sys; print >> sys.stderr, src # for debugging purposes dec_func = eval(src, dict(_func_=func, _call_=caller)) return update_wrapper(dec_func, func, infodict) return update_wrapper(_decorator, caller) def getattr_(obj, name, default_thunk): "Similar to .setdefault in dictionaries." try: return getattr(obj, name) except AttributeError: default = default_thunk() setattr(obj, name, default) return default @decorator def memoize(func, *args): dic = getattr_(func, "memoize_dic", dict) # memoize_dic is created at the first call if args in dic: return dic[args] else: result = func(*args) dic[args] = result return result ########################## LEGALESE ############################### ## Redistributions of source code must retain the above copyright ## notice, this list of conditions and the following disclaimer. ## Redistributions in bytecode form must reproduce the above copyright ## notice, this list of conditions and the following disclaimer in ## the documentation and/or other materials provided with the ## distribution. ## THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ## "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ## LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ## A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ## HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, ## INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, ## BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS ## OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ## ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR ## TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ## USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH ## DAMAGE. nltk-3.1/nltk/downloader.py0000644000076500000240000026327712607224144015557 0ustar sbstaff00000000000000# Natural Language Toolkit: Corpus & Model Downloader # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ The NLTK corpus and module downloader. This module defines several interfaces which can be used to download corpora, models, and other data packages that can be used with NLTK. Downloading Packages ==================== If called with no arguments, ``download()`` will display an interactive interface which can be used to download and install new packages. If Tkinter is available, then a graphical interface will be shown, otherwise a simple text interface will be provided. Individual packages can be downloaded by calling the ``download()`` function with a single argument, giving the package identifier for the package that should be downloaded: >>> download('treebank') # doctest: +SKIP [nltk_data] Downloading package 'treebank'... [nltk_data] Unzipping corpora/treebank.zip. 
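A non-default installation directory can also be passed directly to ``download()`` (an illustrative call; ``/tmp/nltk_data`` is only a placeholder path):

    >>> download('treebank', download_dir='/tmp/nltk_data') # doctest: +SKIP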
NLTK also provides a number of \"package collections\", consisting of a group of related packages. To download all packages in a colleciton, simply call ``download()`` with the collection's identifier: >>> download('all-corpora') # doctest: +SKIP [nltk_data] Downloading package 'abc'... [nltk_data] Unzipping corpora/abc.zip. [nltk_data] Downloading package 'alpino'... [nltk_data] Unzipping corpora/alpino.zip. ... [nltk_data] Downloading package 'words'... [nltk_data] Unzipping corpora/words.zip. Download Directory ================== By default, packages are installed in either a system-wide directory (if Python has sufficient access to write to it); or in the current user's home directory. However, the ``download_dir`` argument may be used to specify a different installation target, if desired. See ``Downloader.default_download_dir()`` for more a detailed description of how the default download directory is chosen. NLTK Download Server ==================== Before downloading any packages, the corpus and module downloader contacts the NLTK download server, to retrieve an index file describing the available packages. By default, this index file is loaded from ``http://www.nltk.org/nltk_data/``. If necessary, it is possible to create a new ``Downloader`` object, specifying a different URL for the package index file. Usage:: python nltk/downloader.py [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS or:: python -m nltk.downloader [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS """ #---------------------------------------------------------------------- from __future__ import print_function, division, unicode_literals """ 0 1 2 3 [label][----][label][----] [column ][column ] Notes ===== Handling data files.. Some questions: * Should the data files be kept zipped or unzipped? I say zipped. * Should the data files be kept in svn at all? Advantages: history; automatic version numbers; 'svn up' could be used rather than the downloader to update the corpora. Disadvantages: they're big, which makes working from svn a bit of a pain. And we're planning to potentially make them much bigger. I don't think we want people to have to download 400MB corpora just to use nltk from svn. * Compromise: keep the data files in trunk/data rather than in trunk/nltk. That way you can check them out in svn if you want to; but you don't need to, and you can use the downloader instead. * Also: keep models in mind. When we change the code, we'd potentially like the models to get updated. This could require a little thought. * So.. let's assume we have a trunk/data directory, containing a bunch of packages. The packages should be kept as zip files, because we really shouldn't be editing them much (well -- we may edit models more, but they tend to be binary-ish files anyway, where diffs aren't that helpful). So we'll have trunk/data, with a bunch of files like abc.zip and treebank.zip and propbank.zip. For each package we could also have eg treebank.xml and propbank.xml, describing the contents of the package (name, copyright, license, etc). Collections would also have .xml files. Finally, we would pull all these together to form a single index.xml file. Some directory structure wouldn't hurt. So how about:: /trunk/data/ ....................... root of data svn index.xml ........................ main index file src/ ............................. python scripts packages/ ........................ dir for packages corpora/ ....................... zip & xml files for corpora grammars/ ...................... 
zip & xml files for grammars taggers/ ....................... zip & xml files for taggers tokenizers/ .................... zip & xml files for tokenizers etc. collections/ ..................... xml files for collections Where the root (/trunk/data) would contain a makefile; and src/ would contain a script to update the info.xml file. It could also contain scripts to rebuild some of the various model files. The script that builds index.xml should probably check that each zip file expands entirely into a single subdir, whose name matches the package's uid. Changes I need to make: - in index: change "size" to "filesize" or "compressed-size" - in index: add "unzipped-size" - when checking status: check both compressed & uncompressed size. uncompressed size is important to make sure we detect a problem if something got partially unzipped. define new status values to differentiate stale vs corrupt vs corruptly-uncompressed?? (we shouldn't need to re-download the file if the zip file is ok but it didn't get uncompressed fully.) - add other fields to the index: author, license, copyright, contact, etc. the current grammars/ package would become a single new package (eg toy-grammars or book-grammars). xml file should have: - authorship info - license info - copyright info - contact info - info about what type of data/annotation it contains? - recommended corpus reader? collections can contain other collections. they can also contain multiple package types (corpora & models). Have a single 'basics' package that includes everything we talk about in the book? n.b.: there will have to be a fallback to the punkt tokenizer, in case they didn't download that model. default: unzip or not? """ import time, os, zipfile, sys, textwrap, threading, itertools from hashlib import md5 try: TKINTER = True from tkinter import (Tk, Frame, Label, Entry, Button, Canvas, Menu, IntVar, TclError) from tkinter.messagebox import showerror from nltk.draw.table import Table from nltk.draw.util import ShowText except: TKINTER = False TclError = ValueError from xml.etree import ElementTree import nltk from nltk import compat #urllib2 = nltk.internals.import_from_stdlib('urllib2') ###################################################################### # Directory entry objects (from the data server's index file) ###################################################################### @compat.python_2_unicode_compatible class Package(object): """ A directory entry for a downloadable package. These entries are extracted from the XML index file that is downloaded by ``Downloader``. Each package consists of a single file; but if that file is a zip file, then it can be automatically decompressed when the package is installed. """ def __init__(self, id, url, name=None, subdir='', size=None, unzipped_size=None, checksum=None, svn_revision=None, copyright='Unknown', contact='Unknown', license='Unknown', author='Unknown', unzip=True, **kw): self.id = id """A unique identifier for this package.""" self.name = name or id """A string name for this package.""" self.subdir = subdir """The subdirectory where this package should be installed. 
E.g., ``'corpora'`` or ``'taggers'``.""" self.url = url """A URL that can be used to download this package's file.""" self.size = int(size) """The filesize (in bytes) of the package file.""" self.unzipped_size = int(unzipped_size) """The total filesize of the files contained in the package's zipfile.""" self.checksum = checksum """The MD-5 checksum of the package file.""" self.svn_revision = svn_revision """A subversion revision number for this package.""" self.copyright = copyright """Copyright holder for this package.""" self.contact = contact """Name & email of the person who should be contacted with questions about this package.""" self.license = license """License information for this package.""" self.author = author """Author of this package.""" ext = os.path.splitext(url.split('/')[-1])[1] self.filename = os.path.join(subdir, id+ext) """The filename that should be used for this package's file. It is formed by joining ``self.subdir`` with ``self.id``, and using the same extension as ``url``.""" self.unzip = bool(int(unzip)) # '0' or '1' """A flag indicating whether this corpus should be unzipped by default.""" # Include any other attributes provided by the XML file. self.__dict__.update(kw) @staticmethod def fromxml(xml): if isinstance(xml, compat.string_types): xml = ElementTree.parse(xml) for key in xml.attrib: xml.attrib[key] = compat.text_type(xml.attrib[key]) return Package(**xml.attrib) def __lt__(self, other): return self.id < other.id def __repr__(self): return '' % self.id @compat.python_2_unicode_compatible class Collection(object): """ A directory entry for a collection of downloadable packages. These entries are extracted from the XML index file that is downloaded by ``Downloader``. """ def __init__(self, id, children, name=None, **kw): self.id = id """A unique identifier for this collection.""" self.name = name or id """A string name for this collection.""" self.children = children """A list of the ``Collections`` or ``Packages`` directly contained by this collection.""" self.packages = None """A list of ``Packages`` contained by this collection or any collections it recursively contains.""" # Include any other attributes provided by the XML file. 
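# Illustrative sketch (not part of this class): a ``Package`` record can be
# built straight from one <package> element of the server index.  The XML
# below is made up for demonstration and roughly mirrors what
# ``Package.fromxml`` does internally.
from xml.etree import ElementTree as _ET
_example_xml = _ET.fromstring(
    '<package id="example" url="http://example.org/packages/example.zip" '
    'subdir="corpora" size="1024" unzipped_size="4096" checksum="0"/>')
_example_pkg = Package(**_example_xml.attrib)
# _example_pkg.filename is os.path.join('corpora', 'example.zip')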
self.__dict__.update(kw) @staticmethod def fromxml(xml): if isinstance(xml, compat.string_types): xml = ElementTree.parse(xml) for key in xml.attrib: xml.attrib[key] = compat.text_type(xml.attrib[key]) children = [child.get('ref') for child in xml.findall('item')] return Collection(children=children, **xml.attrib) def __lt__(self, other): return self.id < other.id def __repr__(self): return '' % self.id ###################################################################### # Message Passing Objects ###################################################################### class DownloaderMessage(object): """A status message object, used by ``incr_download`` to communicate its progress.""" class StartCollectionMessage(DownloaderMessage): """Data server has started working on a collection of packages.""" def __init__(self, collection): self.collection = collection class FinishCollectionMessage(DownloaderMessage): """Data server has finished working on a collection of packages.""" def __init__(self, collection): self.collection = collection class StartPackageMessage(DownloaderMessage): """Data server has started working on a package.""" def __init__(self, package): self.package = package class FinishPackageMessage(DownloaderMessage): """Data server has finished working on a package.""" def __init__(self, package): self.package = package class StartDownloadMessage(DownloaderMessage): """Data server has started downloading a package.""" def __init__(self, package): self.package = package class FinishDownloadMessage(DownloaderMessage): """Data server has finished downloading a package.""" def __init__(self, package): self.package = package class StartUnzipMessage(DownloaderMessage): """Data server has started unzipping a package.""" def __init__(self, package): self.package = package class FinishUnzipMessage(DownloaderMessage): """Data server has finished unzipping a package.""" def __init__(self, package): self.package = package class UpToDateMessage(DownloaderMessage): """The package download file is already up-to-date""" def __init__(self, package): self.package = package class StaleMessage(DownloaderMessage): """The package download file is out-of-date or corrupt""" def __init__(self, package): self.package = package class ErrorMessage(DownloaderMessage): """Data server encountered an error""" def __init__(self, package, message): self.package = package if isinstance(message, Exception): self.message = str(message) else: self.message = message class ProgressMessage(DownloaderMessage): """Indicates how much progress the data server has made""" def __init__(self, progress): self.progress = progress class SelectDownloadDirMessage(DownloaderMessage): """Indicates what download directory the data server is using""" def __init__(self, download_dir): self.download_dir = download_dir ###################################################################### # NLTK Data Server ###################################################################### class Downloader(object): """ A class used to access the NLTK data server, which can be used to download corpora and other data packages. """ #///////////////////////////////////////////////////////////////// # Configuration #///////////////////////////////////////////////////////////////// INDEX_TIMEOUT = 60*60 # 1 hour """The amount of time after which the cached copy of the data server index will be considered 'stale,' and will be re-downloaded.""" DEFAULT_URL = 'http://www.nltk.org/nltk_data/' """The default URL for the NLTK data server's index. 
An alternative URL can be specified when creating a new ``Downloader`` object.""" #///////////////////////////////////////////////////////////////// # Status Constants #///////////////////////////////////////////////////////////////// INSTALLED = 'installed' """A status string indicating that a package or collection is installed and up-to-date.""" NOT_INSTALLED = 'not installed' """A status string indicating that a package or collection is not installed.""" STALE = 'out of date' """A status string indicating that a package or collection is corrupt or out-of-date.""" PARTIAL = 'partial' """A status string indicating that a collection is partially installed (i.e., only some of its packages are installed).""" #///////////////////////////////////////////////////////////////// # Constructor #///////////////////////////////////////////////////////////////// def __init__(self, server_index_url=None, download_dir=None): self._url = server_index_url or self.DEFAULT_URL """The URL for the data server's index file.""" self._collections = {} """Dictionary from collection identifier to ``Collection``""" self._packages = {} """Dictionary from package identifier to ``Package``""" self._download_dir = download_dir """The default directory to which packages will be downloaded.""" self._index = None """The XML index file downloaded from the data server""" self._index_timestamp = None """Time at which ``self._index`` was downloaded. If it is more than ``INDEX_TIMEOUT`` seconds old, it will be re-downloaded.""" self._status_cache = {} """Dictionary from package/collection identifier to status string (``INSTALLED``, ``NOT_INSTALLED``, ``STALE``, or ``PARTIAL``). Cache is used for packages only, not collections.""" self._errors = None """Flag indicating whether all packages were downloaded successfully.""" # decide where we're going to save things to.
if self._download_dir is None: self._download_dir = self.default_download_dir() #///////////////////////////////////////////////////////////////// # Information #///////////////////////////////////////////////////////////////// def list(self, download_dir=None, show_packages=True, show_collections=True, header=True, more_prompt=False, skip_installed=False): lines = 0 # for more_prompt if download_dir is None: download_dir = self._download_dir print('Using default data directory (%s)' % download_dir) if header: print('='*(26+len(self._url))) print(' Data server index for <%s>' % self._url) print('='*(26+len(self._url))) lines += 3 # for more_prompt stale = partial = False categories = [] if show_packages: categories.append('packages') if show_collections: categories.append('collections') for category in categories: print('%s:' % category.capitalize()) lines += 1 # for more_prompt for info in sorted(getattr(self, category)(), key=str): status = self.status(info, download_dir) if status == self.INSTALLED and skip_installed: continue if status == self.STALE: stale = True if status == self.PARTIAL: partial = True prefix = {self.INSTALLED:'*', self.STALE:'-', self.PARTIAL:'P', self.NOT_INSTALLED: ' '}[status] name = textwrap.fill('-'*27 + (info.name or info.id), 75, subsequent_indent=27*' ')[27:] print(' [%s] %s %s' % (prefix, info.id.ljust(20, '.'), name)) lines += len(name.split('\n')) # for more_prompt if more_prompt and lines > 20: user_input = compat.raw_input("Hit Enter to continue: ") if (user_input.lower() in ('x', 'q')): return lines = 0 print() msg = '([*] marks installed packages' if stale: msg += '; [-] marks out-of-date or corrupt packages' if partial: msg += '; [P] marks partially installed collections' print(textwrap.fill(msg+')', subsequent_indent=' ', width=76)) def packages(self): self._update_index() return self._packages.values() def corpora(self): self._update_index() return [pkg for (id,pkg) in self._packages.items() if pkg.subdir == 'corpora'] def models(self): self._update_index() return [pkg for (id,pkg) in self._packages.items() if pkg.subdir != 'corpora'] def collections(self): self._update_index() return self._collections.values() #///////////////////////////////////////////////////////////////// # Downloading #///////////////////////////////////////////////////////////////// def _info_or_id(self, info_or_id): if isinstance(info_or_id, compat.string_types): return self.info(info_or_id) else: return info_or_id # [xx] When during downloading is it 'safe' to abort? Only unsafe # time is *during* an unzip -- we don't want to leave a # partially-unzipped corpus in place because we wouldn't notice # it. But if we had the exact total size of the unzipped corpus, # then that would be fine. Then we could abort anytime we want! # So this is really what we should do. That way the threaded # downloader in the gui can just kill the download thread anytime # it wants. def incr_download(self, info_or_id, download_dir=None, force=False): # If they didn't specify a download_dir, then use the default one. if download_dir is None: download_dir = self._download_dir yield SelectDownloadDirMessage(download_dir) # If they gave us a list of ids, then download each one. if isinstance(info_or_id, (list,tuple)): for msg in self._download_list(info_or_id, download_dir, force): yield msg return # Look up the requested collection or package. 
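# Illustrative sketch (not part of this method): ``incr_download`` is a
# generator of ``DownloaderMessage`` objects, so callers can build their own
# progress display on top of it.  The helper name and the 'punkt' package id
# below are just examples.
def _print_progress(downloader, package_id='punkt'):
    for msg in downloader.incr_download(package_id):
        if isinstance(msg, ProgressMessage):
            print('%3d%% done' % msg.progress)
        elif isinstance(msg, ErrorMessage):
            print('Error: %s' % msg.message)
        # other DownloaderMessage subclasses mark collection/package/unzip
        # start and finish events and can simply be ignored here.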
try: info = self._info_or_id(info_or_id) except (IOError, ValueError) as e: yield ErrorMessage(None, 'Error loading %s: %s' % (info_or_id, e)) return # Handle collections. if isinstance(info, Collection): yield StartCollectionMessage(info) for msg in self.incr_download(info.children, download_dir, force): yield msg yield FinishCollectionMessage(info) # Handle Packages (delegate to a helper function). else: for msg in self._download_package(info, download_dir, force): yield msg def _num_packages(self, item): if isinstance(item, Package): return 1 else: return len(item.packages) def _download_list(self, items, download_dir, force): # Look up the requested items. for i in range(len(items)): try: items[i] = self._info_or_id(items[i]) except (IOError, ValueError) as e: yield ErrorMessage(items[i], e) return # Download each item, re-scaling their progress. num_packages = sum(self._num_packages(item) for item in items) progress = 0 for i, item in enumerate(items): if isinstance(item, Package): delta = 1./num_packages else: delta = float(len(item.packages))/num_packages for msg in self.incr_download(item, download_dir, force): if isinstance(msg, ProgressMessage): yield ProgressMessage(progress + msg.progress*delta) else: yield msg progress += 100*delta def _download_package(self, info, download_dir, force): yield StartPackageMessage(info) yield ProgressMessage(0) # Do we already have the current version? status = self.status(info, download_dir) if not force and status == self.INSTALLED: yield UpToDateMessage(info) yield ProgressMessage(100) yield FinishPackageMessage(info) return # Remove the package from our status cache self._status_cache.pop(info.id, None) # Check for (and remove) any old/stale version. filepath = os.path.join(download_dir, info.filename) if os.path.exists(filepath): if status == self.STALE: yield StaleMessage(info) os.remove(filepath) # Ensure the download_dir exists if not os.path.exists(download_dir): os.mkdir(download_dir) if not os.path.exists(os.path.join(download_dir, info.subdir)): os.mkdir(os.path.join(download_dir, info.subdir)) # Download the file. This will raise an IOError if the url # is not found. yield StartDownloadMessage(info) yield ProgressMessage(5) try: infile = compat.urlopen(info.url) with open(filepath, 'wb') as outfile: #print info.size num_blocks = max(1, float(info.size)/(1024*16)) for block in itertools.count(): s = infile.read(1024*16) # 16k blocks. outfile.write(s) if not s: break if block % 2 == 0: # how often? yield ProgressMessage(min(80, 5+75*(block/num_blocks))) infile.close() except IOError as e: yield ErrorMessage(info, 'Error downloading %r from <%s>:' '\n %s' % (info.id, info.url, e)) return yield FinishDownloadMessage(info) yield ProgressMessage(80) # If it's a zipfile, uncompress it. if info.filename.endswith('.zip'): zipdir = os.path.join(download_dir, info.subdir) # Unzip if we're unzipping by default; *or* if it's already # been unzipped (presumably a previous version). if info.unzip or os.path.exists(os.path.join(zipdir, info.id)): yield StartUnzipMessage(info) for msg in _unzip_iter(filepath, zipdir, verbose=False): # Somewhat of a hack, but we need a proper package reference msg.package = info yield msg yield FinishUnzipMessage(info) yield FinishPackageMessage(info) def download(self, info_or_id=None, download_dir=None, quiet=False, force=False, prefix='[nltk_data] ', halt_on_error=True, raise_on_error=False): # If no info or id is given, then use the interactive shell. 
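# Illustrative sketch (not part of this method): a standalone version of the
# block-wise copy performed by ``_download_package`` -- stream the package
# file in 16k blocks and report progress in the same 5..80% band.  The helper
# name is hypothetical; ``compat.urlopen`` is the same wrapper used there.
def _stream_to_file(url, filepath, total_size, block_size=16 * 1024):
    from nltk import compat
    infile = compat.urlopen(url)
    written = 0
    with open(filepath, 'wb') as outfile:
        while True:
            block = infile.read(block_size)
            if not block:
                break
            outfile.write(block)
            written += len(block)
            yield min(80, 5 + 75 * written // max(int(total_size), 1))
    infile.close()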
if info_or_id is None: # [xx] hmm -- changing self._download_dir here seems like # the wrong thing to do. Maybe the _interactive_download # function should make a new copy of self to use? if download_dir is not None: self._download_dir = download_dir self._interactive_download() return True else: # Define a helper function for displaying output: def show(s, prefix2=''): print(textwrap.fill(s, initial_indent=prefix+prefix2, subsequent_indent=prefix+prefix2+' '*4)) for msg in self.incr_download(info_or_id, download_dir, force): # Error messages if isinstance(msg, ErrorMessage): show(msg.message) if raise_on_error: raise ValueError(msg.message) if halt_on_error: return False self._errors = True if not quiet: print("Error installing package. Retry? [n/y/e]") choice = compat.raw_input().strip() if choice in ['y', 'Y']: if not self.download(msg.package.id, download_dir, quiet, force, prefix, halt_on_error, raise_on_error): return False elif choice in ['e', 'E']: return False # All other messages if not quiet: # Collection downloading messages: if isinstance(msg, StartCollectionMessage): show('Downloading collection %r' % msg.collection.id) prefix += ' | ' print(prefix) elif isinstance(msg, FinishCollectionMessage): print(prefix) prefix = prefix[:-4] if self._errors: show('Downloaded collection %r with errors' % msg.collection.id) else: show('Done downloading collection %s' % msg.collection.id) # Package downloading messages: elif isinstance(msg, StartPackageMessage): show('Downloading package %s to %s...' % (msg.package.id, download_dir)) elif isinstance(msg, UpToDateMessage): show('Package %s is already up-to-date!' % msg.package.id, ' ') #elif isinstance(msg, StaleMessage): # show('Package %s is out-of-date or corrupt' % # msg.package.id, ' ') elif isinstance(msg, StartUnzipMessage): show('Unzipping %s.' % msg.package.filename, ' ') # Data directory message: elif isinstance(msg, SelectDownloadDirMessage): download_dir = msg.download_dir return True def is_stale(self, info_or_id, download_dir=None): return self.status(info_or_id, download_dir) == self.STALE def is_installed(self, info_or_id, download_dir=None): return self.status(info_or_id, download_dir) == self.INSTALLED def clear_status_cache(self, id=None): if id is None: self._status_cache.clear() else: self._status_cache.pop(id, None) def status(self, info_or_id, download_dir=None): """ Return a constant describing the status of the given package or collection. Status can be one of ``INSTALLED``, ``NOT_INSTALLED``, ``STALE``, or ``PARTIAL``. """ if download_dir is None: download_dir = self._download_dir info = self._info_or_id(info_or_id) # Handle collections: if isinstance(info, Collection): pkg_status = [self.status(pkg.id) for pkg in info.packages] if self.STALE in pkg_status: return self.STALE elif self.PARTIAL in pkg_status: return self.PARTIAL elif (self.INSTALLED in pkg_status and self.NOT_INSTALLED in pkg_status): return self.PARTIAL elif self.NOT_INSTALLED in pkg_status: return self.NOT_INSTALLED else: return self.INSTALLED # Handle packages: else: filepath = os.path.join(download_dir, info.filename) if download_dir != self._download_dir: status = self._pkg_status(info, filepath) else: if info.id not in self._status_cache: self._status_cache[info.id] = self._pkg_status(info, filepath) return self._status_cache[info.id] def _pkg_status(self, info, filepath): if not os.path.exists(filepath): return self.NOT_INSTALLED # Check if the file has the correct size. 
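# Illustrative sketch (not part of this method): the md5 test performed by
# ``_pkg_status`` just below can be reproduced with ``hashlib`` alone; the
# helper name is made up.
import hashlib

def _md5_of(filepath, block_size=1024 * 64):
    digest = hashlib.md5()
    with open(filepath, 'rb') as stream:
        for block in iter(lambda: stream.read(block_size), b''):
            digest.update(block)
    return digest.hexdigest()
# A package file whose size matches but whose digest differs from
# ``info.checksum`` is reported as STALE.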
try: filestat = os.stat(filepath) except OSError: return self.NOT_INSTALLED if filestat.st_size != int(info.size): return self.STALE # Check if the file's checksum matches if md5_hexdigest(filepath) != info.checksum: return self.STALE # If it's a zipfile, and it's been at least partially # unzipped, then check if it's been fully unzipped. if filepath.endswith('.zip'): unzipdir = filepath[:-4] if not os.path.exists(unzipdir): return self.INSTALLED # but not unzipped -- ok! if not os.path.isdir(unzipdir): return self.STALE unzipped_size = sum(os.stat(os.path.join(d, f)).st_size for d, _, files in os.walk(unzipdir) for f in files) if unzipped_size != info.unzipped_size: return self.STALE # Otherwise, everything looks good. return self.INSTALLED def update(self, quiet=False, prefix='[nltk_data] '): """ Re-download any packages whose status is STALE. """ self.clear_status_cache() for pkg in self.packages(): if self.status(pkg) == self.STALE: self.download(pkg, quiet=quiet, prefix=prefix) #///////////////////////////////////////////////////////////////// # Index #///////////////////////////////////////////////////////////////// def _update_index(self, url=None): """A helper function that ensures that self._index is up-to-date. If the index is older than self.INDEX_TIMEOUT, then download it again.""" # Check if the index is aleady up-to-date. If so, do nothing. if not (self._index is None or url is not None or time.time()-self._index_timestamp > self.INDEX_TIMEOUT): return # If a URL was specified, then update our URL. self._url = url or self._url # Download the index file. self._index = nltk.internals.ElementWrapper( ElementTree.parse(compat.urlopen(self._url)).getroot()) self._index_timestamp = time.time() # Build a dictionary of packages. packages = [Package.fromxml(p) for p in self._index.findall('packages/package')] self._packages = dict((p.id, p) for p in packages) # Build a dictionary of collections. collections = [Collection.fromxml(c) for c in self._index.findall('collections/collection')] self._collections = dict((c.id, c) for c in collections) # Replace identifiers with actual children in collection.children. for collection in self._collections.values(): for i, child_id in enumerate(collection.children): if child_id in self._packages: collection.children[i] = self._packages[child_id] elif child_id in self._collections: collection.children[i] = self._collections[child_id] else: print('removing collection member with no package: {}'.format(child_id)) del collection.children[i] # Fill in collection.packages for each collection. for collection in self._collections.values(): packages = {} queue = [collection] for child in queue: if isinstance(child, Collection): queue.extend(child.children) else: packages[child.id] = child collection.packages = packages.values() # Flush the status cache self._status_cache.clear() def index(self): """ Return the XML index describing the packages available from the data server. If necessary, this index will be downloaded from the data server. 
""" self._update_index() return self._index def info(self, id): """Return the ``Package`` or ``Collection`` record for the given item.""" self._update_index() if id in self._packages: return self._packages[id] if id in self._collections: return self._collections[id] raise ValueError('Package %r not found in index' % id) def xmlinfo(self, id): """Return the XML info record for the given item""" self._update_index() for package in self._index.findall('packages/package'): if package.get('id') == id: return package for collection in self._index.findall('collections/collection'): if collection.get('id') == id: return collection raise ValueError('Package %r not found in index' % id) #///////////////////////////////////////////////////////////////// # URL & Data Directory #///////////////////////////////////////////////////////////////// def _get_url(self): """The URL for the data server's index file.""" return self._url def _set_url(self, url): """ Set a new URL for the data server. If we're unable to contact the given url, then the original url is kept. """ original_url = self._url try: self._update_index(url) except: self._url = original_url raise url = property(_get_url, _set_url) def default_download_dir(self): """ Return the directory to which packages will be downloaded by default. This value can be overridden using the constructor, or on a case-by-case basis using the ``download_dir`` argument when calling ``download()``. On Windows, the default download directory is ``PYTHONHOME/lib/nltk``, where *PYTHONHOME* is the directory containing Python, e.g. ``C:\\Python25``. On all other platforms, the default directory is the first of the following which exists or which can be created with write permission: ``/usr/share/nltk_data``, ``/usr/local/share/nltk_data``, ``/usr/lib/nltk_data``, ``/usr/local/lib/nltk_data``, ``~/nltk_data``. """ # Check if we are on GAE where we cannot write into filesystem. if 'APPENGINE_RUNTIME' in os.environ: return # Check if we have sufficient permissions to install in a # variety of system-wide locations. for nltkdir in nltk.data.path: if (os.path.exists(nltkdir) and nltk.internals.is_writable(nltkdir)): return nltkdir # On Windows, use %APPDATA% if sys.platform == 'win32' and 'APPDATA' in os.environ: homedir = os.environ['APPDATA'] # Otherwise, install in the user's home directory. else: homedir = os.path.expanduser('~/') if homedir == '~/': raise ValueError("Could not find a default download directory") # append "nltk_data" to the home directory return os.path.join(homedir, 'nltk_data') def _get_download_dir(self): """ The default directory to which packages will be downloaded. This defaults to the value returned by ``default_download_dir()``. To override this default on a case-by-case basis, use the ``download_dir`` argument when calling ``download()``. """ return self._download_dir def _set_download_dir(self, download_dir): self._download_dir = download_dir # Clear the status cache. self._status_cache.clear() download_dir = property(_get_download_dir, _set_download_dir) #///////////////////////////////////////////////////////////////// # Interactive Shell #///////////////////////////////////////////////////////////////// def _interactive_download(self): # Try the GUI first; if that doesn't work, try the simple # interactive shell. 
if TKINTER: try: DownloaderGUI(self).mainloop() except TclError: DownloaderShell(self).run() else: DownloaderShell(self).run() class DownloaderShell(object): def __init__(self, dataserver): self._ds = dataserver def _simple_interactive_menu(self, *options): print('-'*75) spc = (68 - sum(len(o) for o in options))//(len(options)-1)*' ' print(' ' + spc.join(options)) #w = 76/len(options) #fmt = ' ' + ('%-'+str(w)+'s')*(len(options)-1) + '%s' #print fmt % options print('-'*75) def run(self): print('NLTK Downloader') while True: self._simple_interactive_menu( 'd) Download', 'l) List', ' u) Update', 'c) Config', 'h) Help', 'q) Quit') user_input = compat.raw_input('Downloader> ').strip() if not user_input: print(); continue command = user_input.lower().split()[0] args = user_input.split()[1:] try: if command == 'l': print() self._ds.list(self._ds.download_dir, header=False, more_prompt=True) elif command == 'h': self._simple_interactive_help() elif command == 'c': self._simple_interactive_config() elif command in ('q', 'x'): return elif command == 'd': self._simple_interactive_download(args) elif command == 'u': self._simple_interactive_update() else: print('Command %r unrecognized' % user_input) except compat.HTTPError as e: print('Error reading from server: %s'%e) except compat.URLError as e: print('Error connecting to server: %s'%e.reason) # try checking if user_input is a package name, & # downloading it? print() def _simple_interactive_download(self, args): if args: for arg in args: try: self._ds.download(arg, prefix=' ') except (IOError, ValueError) as e: print(e) else: while True: print() print('Download which package (l=list; x=cancel)?') user_input = compat.raw_input(' Identifier> ') if user_input.lower()=='l': self._ds.list(self._ds.download_dir, header=False, more_prompt=True, skip_installed=True) continue elif user_input.lower() in ('x', 'q', ''): return elif user_input: for id in user_input.split(): try: self._ds.download(id, prefix=' ') except (IOError, ValueError) as e: print(e) break def _simple_interactive_update(self): while True: stale_packages = [] stale = partial = False for info in sorted(getattr(self._ds, 'packages')(), key=str): if self._ds.status(info) == self._ds.STALE: stale_packages.append((info.id, info.name)) print() if stale_packages: print('Will update following packages (o=ok; x=cancel)') for pid, pname in stale_packages: name = textwrap.fill('-'*27 + (pname), 75, subsequent_indent=27*' ')[27:] print(' [ ] %s %s' % (pid.ljust(20, '.'), name)) print() user_input = compat.raw_input(' Identifier> ') if user_input.lower()=='o': for pid, pname in stale_packages: try: self._ds.download(pid, prefix=' ') except (IOError, ValueError) as e: print(e) break elif user_input.lower() in ('x', 'q', ''): return else: print('Nothing to update.') return def _simple_interactive_help(self): print() print('Commands:') print(' d) Download a package or collection u) Update out of date packages') print(' l) List packages & collections h) Help') print(' c) View & Modify Configuration q) Quit') def _show_config(self): print() print('Data Server:') print(' - URL: <%s>' % self._ds.url) print((' - %d Package Collections Available' % len(self._ds.collections()))) print((' - %d Individual Packages Available' % len(self._ds.packages()))) print() print('Local Machine:') print(' - Data directory: %s' % self._ds.download_dir) def _simple_interactive_config(self): self._show_config() while True: print() self._simple_interactive_menu( 's) Show Config', 'u) Set Server URL', 'd) Set Data Dir', 'm) Main 
Menu') user_input = compat.raw_input('Config> ').strip().lower() if user_input == 's': self._show_config() elif user_input == 'd': new_dl_dir = compat.raw_input(' New Directory> ').strip() if new_dl_dir in ('', 'x', 'q', 'X', 'Q'): print(' Cancelled!') elif os.path.isdir(new_dl_dir): self._ds.download_dir = new_dl_dir else: print(('Directory %r not found! Create it first.' % new_dl_dir)) elif user_input == 'u': new_url = compat.raw_input(' New URL> ').strip() if new_url in ('', 'x', 'q', 'X', 'Q'): print(' Cancelled!') else: if not new_url.startswith('http://'): new_url = 'http://'+new_url try: self._ds.url = new_url except Exception as e: print('Error reading <%r>:\n %s' % (new_url, e)) elif user_input == 'm': break class DownloaderGUI(object): """ Graphical interface for downloading packages from the NLTK data server. """ #///////////////////////////////////////////////////////////////// # Column Configuration #///////////////////////////////////////////////////////////////// COLUMNS = ['', 'Identifier', 'Name', 'Size', 'Status', 'Unzipped Size', 'Copyright', 'Contact', 'License', 'Author', 'Subdir', 'Checksum'] """A list of the names of columns. This controls the order in which the columns will appear. If this is edited, then ``_package_to_columns()`` may need to be edited to match.""" COLUMN_WEIGHTS = {'': 0, 'Name': 5, 'Size': 0, 'Status': 0} """A dictionary specifying how columns should be resized when the table is resized. Columns with weight 0 will not be resized at all; and columns with high weight will be resized more. Default weight (for columns not explicitly listed) is 1.""" COLUMN_WIDTHS = {'':1, 'Identifier':20, 'Name':45, 'Size': 10, 'Unzipped Size': 10, 'Status': 12} """A dictionary specifying how wide each column should be, in characters. The default width (for columns not explicitly listed) is specified by ``DEFAULT_COLUMN_WIDTH``.""" DEFAULT_COLUMN_WIDTH = 30 """The default width for columns that are not explicitly listed in ``COLUMN_WIDTHS``.""" INITIAL_COLUMNS = ['', 'Identifier', 'Name', 'Size', 'Status'] """The set of columns that should be displayed by default.""" # Perform a few import-time sanity checks to make sure that the # column configuration variables are defined consistently: for c in COLUMN_WEIGHTS: assert c in COLUMNS for c in COLUMN_WIDTHS: assert c in COLUMNS for c in INITIAL_COLUMNS: assert c in COLUMNS #///////////////////////////////////////////////////////////////// # Color Configuration #///////////////////////////////////////////////////////////////// _BACKDROP_COLOR = ('#000', '#ccc') _ROW_COLOR = {Downloader.INSTALLED: ('#afa', '#080'), Downloader.PARTIAL: ('#ffa', '#880'), Downloader.STALE: ('#faa', '#800'), Downloader.NOT_INSTALLED: ('#fff', '#888')} _MARK_COLOR = ('#000', '#ccc') #_FRONT_TAB_COLOR = ('#ccf', '#008') #_BACK_TAB_COLOR = ('#88a', '#448') _FRONT_TAB_COLOR = ('#fff', '#45c') _BACK_TAB_COLOR = ('#aaa', '#67a') _PROGRESS_COLOR = ('#f00', '#aaa') _TAB_FONT = 'helvetica -16 bold' #///////////////////////////////////////////////////////////////// # Constructor #///////////////////////////////////////////////////////////////// def __init__(self, dataserver, use_threads=True): self._ds = dataserver self._use_threads = use_threads # For the threaded downloader: self._download_lock = threading.Lock() self._download_msg_queue = [] self._download_abort_queue = [] self._downloading = False # For tkinter after callbacks: self._afterid = {} # A message log. 
self._log_messages = [] self._log_indent = 0 self._log('NLTK Downloader Started!') # Create the main window. top = self.top = Tk() top.geometry('+50+50') top.title('NLTK Downloader') top.configure(background=self._BACKDROP_COLOR[1]) # Set up some bindings now, in case anything goes wrong. top.bind('', self.destroy) top.bind('', self.destroy) self._destroyed = False self._column_vars = {} # Initialize the GUI. self._init_widgets() self._init_menu() try: self._fill_table() except compat.HTTPError as e: showerror('Error reading from server', e) except compat.URLError as e: showerror('Error connecting to server', e.reason) self._show_info() self._select_columns() self._table.select(0) # Make sure we get notified when we're destroyed, so we can # cancel any download in progress. self._table.bind('', self._destroy) def _log(self, msg): self._log_messages.append('%s %s%s' % (time.ctime(), ' | '*self._log_indent, msg)) #///////////////////////////////////////////////////////////////// # Internals #///////////////////////////////////////////////////////////////// def _init_widgets(self): # Create the top-level frame structures f1 = Frame(self.top, relief='raised', border=2, padx=8, pady=0) f1.pack(sid='top', expand=True, fill='both') f1.grid_rowconfigure(2, weight=1) f1.grid_columnconfigure(0, weight=1) Frame(f1, height=8).grid(column=0, row=0) # spacer tabframe = Frame(f1) tabframe.grid(column=0, row=1, sticky='news') tableframe = Frame(f1) tableframe.grid(column=0, row=2, sticky='news') buttonframe = Frame(f1) buttonframe.grid(column=0, row=3, sticky='news') Frame(f1, height=8).grid(column=0, row=4) # spacer infoframe = Frame(f1) infoframe.grid(column=0, row=5, sticky='news') Frame(f1, height=8).grid(column=0, row=6) # spacer progressframe = Frame(self.top, padx=3, pady=3, background=self._BACKDROP_COLOR[1]) progressframe.pack(side='bottom', fill='x') self.top['border'] = 0 self.top['highlightthickness'] = 0 # Create the tabs self._tab_names = ['Collections', 'Corpora', 'Models', 'All Packages',] self._tabs = {} for i, tab in enumerate(self._tab_names): label = Label(tabframe, text=tab, font=self._TAB_FONT) label.pack(side='left', padx=((i+1)%2)*10) label.bind('', self._select_tab) self._tabs[tab.lower()] = label # Create the table. 
column_weights = [self.COLUMN_WEIGHTS.get(column, 1) for column in self.COLUMNS] self._table = Table(tableframe, self.COLUMNS, column_weights=column_weights, highlightthickness=0, listbox_height=16, reprfunc=self._table_reprfunc) self._table.columnconfig(0, foreground=self._MARK_COLOR[0]) # marked for i, column in enumerate(self.COLUMNS): width = self.COLUMN_WIDTHS.get(column, self.DEFAULT_COLUMN_WIDTH) self._table.columnconfig(i, width=width) self._table.pack(expand=True, fill='both') self._table.focus() self._table.bind_to_listboxes('', self._download) self._table.bind('', self._table_mark) self._table.bind('', self._download) self._table.bind('', self._prev_tab) self._table.bind('', self._next_tab) self._table.bind('', self._mark_all) # Create entry boxes for URL & download_dir infoframe.grid_columnconfigure(1, weight=1) info = [('url', 'Server Index:', self._set_url), ('download_dir','Download Directory:',self._set_download_dir)] self._info = {} for (i, (key, label, callback)) in enumerate(info): Label(infoframe, text=label).grid(column=0, row=i, sticky='e') entry = Entry(infoframe, font='courier', relief='groove', disabledforeground='black') self._info[key] = (entry, callback) entry.bind('', self._info_save) entry.bind('', lambda e,key=key: self._info_edit(key)) entry.grid(column=1, row=i, sticky='ew') # If the user edits url or download_dir, and then clicks outside # the entry box, then save their results. self.top.bind('', self._info_save) # Create Download & Refresh buttons. self._download_button = Button( buttonframe, text='Download', command=self._download, width=8) self._download_button.pack(side='left') self._refresh_button = Button( buttonframe, text='Refresh', command=self._refresh, width=8) self._refresh_button.pack(side='right') # Create Progress bar self._progresslabel = Label(progressframe, text='', foreground=self._BACKDROP_COLOR[0], background=self._BACKDROP_COLOR[1]) self._progressbar = Canvas(progressframe, width=200, height=16, background=self._PROGRESS_COLOR[1], relief='sunken', border=1) self._init_progressbar() self._progressbar.pack(side='right') self._progresslabel.pack(side='left') def _init_menu(self): menubar = Menu(self.top) filemenu = Menu(menubar, tearoff=0) filemenu.add_command(label='Download', underline=0, command=self._download, accelerator='Return') filemenu.add_separator() filemenu.add_command(label='Change Server Index', underline=7, command=lambda: self._info_edit('url')) filemenu.add_command(label='Change Download Directory', underline=0, command=lambda: self._info_edit('download_dir')) filemenu.add_separator() filemenu.add_command(label='Show Log', underline=5, command=self._show_log) filemenu.add_separator() filemenu.add_command(label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x') menubar.add_cascade(label='File', underline=0, menu=filemenu) # Create a menu to control which columns of the table are # shown. n.b.: we never hide the first two columns (mark and # identifier). viewmenu = Menu(menubar, tearoff=0) for column in self._table.column_names[2:]: var = IntVar(self.top) assert column not in self._column_vars self._column_vars[column] = var if column in self.INITIAL_COLUMNS: var.set(1) viewmenu.add_checkbutton(label=column, underline=0, variable=var, command=self._select_columns) menubar.add_cascade(label='View', underline=0, menu=viewmenu) # Create a sort menu # [xx] this should be selectbuttons; and it should include # reversed sorts as options. 
sortmenu = Menu(menubar, tearoff=0) for column in self._table.column_names[1:]: sortmenu.add_command(label='Sort by %s' % column, command=(lambda c=column: self._table.sort_by(c, 'ascending'))) sortmenu.add_separator() #sortmenu.add_command(label='Descending Sort:') for column in self._table.column_names[1:]: sortmenu.add_command(label='Reverse sort by %s' % column, command=(lambda c=column: self._table.sort_by(c, 'descending'))) menubar.add_cascade(label='Sort', underline=0, menu=sortmenu) helpmenu = Menu(menubar, tearoff=0) helpmenu.add_command(label='About', underline=0, command=self.about) helpmenu.add_command(label='Instructions', underline=0, command=self.help, accelerator='F1') menubar.add_cascade(label='Help', underline=0, menu=helpmenu) self.top.bind('', self.help) self.top.config(menu=menubar) def _select_columns(self): for (column, var) in self._column_vars.items(): if var.get(): self._table.show_column(column) else: self._table.hide_column(column) def _refresh(self): self._ds.clear_status_cache() try: self._fill_table() except compat.HTTPError as e: showerror('Error reading from server', e) except compat.URLError as e: showerror('Error connecting to server', e.reason) self._table.select(0) def _info_edit(self, info_key): self._info_save() # just in case. (entry, callback) = self._info[info_key] entry['state'] = 'normal' entry['relief'] = 'sunken' entry.focus() def _info_save(self, e=None): focus = self._table for entry, callback in self._info.values(): if entry['state'] == 'disabled': continue if e is not None and e.widget is entry and e.keysym != 'Return': focus = entry else: entry['state'] = 'disabled' entry['relief'] = 'groove' callback(entry.get()) focus.focus() def _table_reprfunc(self, row, col, val): if self._table.column_names[col].endswith('Size'): if isinstance(val, compat.string_types): return ' %s' % val elif val < 1024**2: return ' %.1f KB' % (val/1024.**1) elif val < 1024**3: return ' %.1f MB' % (val/1024.**2) else: return ' %.1f GB' % (val/1024.**3) if col in (0, ''): return str(val) else: return ' %s' % val def _set_url(self, url): if url == self._ds.url: return try: self._ds.url = url self._fill_table() except IOError as e: showerror('Error Setting Server Index', str(e)) self._show_info() def _set_download_dir(self, download_dir): if self._ds.download_dir == download_dir: return # check if the dir exists, and if not, ask if we should create it? 
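# Illustrative sketch (not part of this class): the human-readable size
# formatting in ``_table_reprfunc`` above, as a free function with a made-up
# name.
def _format_size(num_bytes):
    if num_bytes < 1024 ** 2:
        return '%.1f KB' % (num_bytes / 1024.0)
    elif num_bytes < 1024 ** 3:
        return '%.1f MB' % (num_bytes / 1024.0 ** 2)
    else:
        return '%.1f GB' % (num_bytes / 1024.0 ** 3)
# e.g. _format_size(2621440) == '2.5 MB'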
# Clear our status cache, & re-check what's installed self._ds.download_dir = download_dir try: self._fill_table() except compat.HTTPError as e: showerror('Error reading from server', e) except compat.URLError as e: showerror('Error connecting to server', e.reason) self._show_info() def _show_info(self): print('showing info', self._ds.url) for entry,cb in self._info.values(): entry['state'] = 'normal' entry.delete(0, 'end') self._info['url'][0].insert(0, self._ds.url) self._info['download_dir'][0].insert(0, self._ds.download_dir) for entry,cb in self._info.values(): entry['state'] = 'disabled' def _prev_tab(self, *e): for i, tab in enumerate(self._tab_names): if tab.lower() == self._tab and i > 0: self._tab = self._tab_names[i-1].lower() try: return self._fill_table() except compat.HTTPError as e: showerror('Error reading from server', e) except compat.URLError as e: showerror('Error connecting to server', e.reason) def _next_tab(self, *e): for i, tab in enumerate(self._tab_names): if tab.lower() == self._tab and i < (len(self._tabs)-1): self._tab = self._tab_names[i+1].lower() try: return self._fill_table() except compat.HTTPError as e: showerror('Error reading from server', e) except compat.URLError as e: showerror('Error connecting to server', e.reason) def _select_tab(self, event): self._tab = event.widget['text'].lower() try: self._fill_table() except compat.HTTPError as e: showerror('Error reading from server', e) except compat.URLError as e: showerror('Error connecting to server', e.reason) _tab = 'collections' #_tab = 'corpora' _rows = None def _fill_table(self): selected_row = self._table.selected_row() self._table.clear() if self._tab == 'all packages': items = self._ds.packages() elif self._tab == 'corpora': items = self._ds.corpora() elif self._tab == 'models': items = self._ds.models() elif self._tab == 'collections': items = self._ds.collections() else: assert 0, 'bad tab value %r' % self._tab rows = [self._package_to_columns(item) for item in items] self._table.extend(rows) # Highlight the active tab. for tab, label in self._tabs.items(): if tab == self._tab: label.configure(foreground=self._FRONT_TAB_COLOR[0], background=self._FRONT_TAB_COLOR[1]) else: label.configure(foreground=self._BACK_TAB_COLOR[0], background=self._BACK_TAB_COLOR[1]) self._table.sort_by('Identifier', order='ascending') self._color_table() self._table.select(selected_row) # This is a hack, because the scrollbar isn't updating its # position right -- I'm not sure what the underlying cause is # though. (This is on OS X w/ python 2.5) The length of # delay that's necessary seems to depend on how fast the # comptuer is. :-/ self.top.after(150, self._table._scrollbar.set, *self._table._mlb.yview()) self.top.after(300, self._table._scrollbar.set, *self._table._mlb.yview()) def _update_table_status(self): for row_num in range(len(self._table)): status = self._ds.status(self._table[row_num, 'Identifier']) self._table[row_num, 'Status'] = status self._color_table() def _download(self, *e): # If we're using threads, then delegate to the threaded # downloader instead. 
if self._use_threads: return self._download_threaded(*e) marked = [self._table[row, 'Identifier'] for row in range(len(self._table)) if self._table[row, 0] != ''] selection = self._table.selected_row() if not marked and selection is not None: marked = [self._table[selection, 'Identifier']] download_iter = self._ds.incr_download(marked, self._ds.download_dir) self._log_indent = 0 self._download_cb(download_iter, marked) _DL_DELAY=10 def _download_cb(self, download_iter, ids): try: msg = next(download_iter) except StopIteration: #self._fill_table(sort=False) self._update_table_status() afterid = self.top.after(10, self._show_progress, 0) self._afterid['_download_cb'] = afterid return def show(s): self._progresslabel['text'] = s self._log(s) if isinstance(msg, ProgressMessage): self._show_progress(msg.progress) elif isinstance(msg, ErrorMessage): show(msg.message) if msg.package is not None: self._select(msg.package.id) self._show_progress(None) return # halt progress. elif isinstance(msg, StartCollectionMessage): show('Downloading collection %s' % msg.collection.id) self._log_indent += 1 elif isinstance(msg, StartPackageMessage): show('Downloading package %s' % msg.package.id) elif isinstance(msg, UpToDateMessage): show('Package %s is up-to-date!' % msg.package.id) #elif isinstance(msg, StaleMessage): # show('Package %s is out-of-date or corrupt' % msg.package.id) elif isinstance(msg, FinishDownloadMessage): show('Finished downloading %r.' % msg.package.id) elif isinstance(msg, StartUnzipMessage): show('Unzipping %s' % msg.package.filename) elif isinstance(msg, FinishCollectionMessage): self._log_indent -= 1 show('Finished downloading collection %r.' % msg.collection.id) self._clear_mark(msg.collection.id) elif isinstance(msg, FinishPackageMessage): self._clear_mark(msg.package.id) afterid = self.top.after(self._DL_DELAY, self._download_cb, download_iter, ids) self._afterid['_download_cb'] = afterid def _select(self, id): for row in range(len(self._table)): if self._table[row, 'Identifier'] == id: self._table.select(row) return def _color_table(self): # Color rows according to status. for row in range(len(self._table)): bg, sbg = self._ROW_COLOR[self._table[row, 'Status']] fg, sfg = ('black', 'white') self._table.rowconfig(row, foreground=fg, selectforeground=sfg, background=bg, selectbackground=sbg) # Color the marked column self._table.itemconfigure(row, 0, foreground=self._MARK_COLOR[0], background=self._MARK_COLOR[1]) def _clear_mark(self, id): for row in range(len(self._table)): if self._table[row, 'Identifier'] == id: self._table[row, 0] = '' def _mark_all(self, *e): for row in range(len(self._table)): self._table[row,0] = 'X' def _table_mark(self, *e): selection = self._table.selected_row() if selection >= 0: if self._table[selection][0] != '': self._table[selection,0] = '' else: self._table[selection,0] = 'X' self._table.select(delta=1) def _show_log(self): text = '\n'.join(self._log_messages) ShowText(self.top, 'NLTK Downloader Log', text) def _package_to_columns(self, pkg): """ Given a package, return a list of values describing that package, one for each column in ``self.COLUMNS``. 
""" row = [] for column_index, column_name in enumerate(self.COLUMNS): if column_index == 0: # Mark: row.append('') elif column_name == 'Identifier': row.append(pkg.id) elif column_name == 'Status': row.append(self._ds.status(pkg)) else: attr = column_name.lower().replace(' ', '_') row.append(getattr(pkg, attr, 'n/a')) return row #///////////////////////////////////////////////////////////////// # External Interface #///////////////////////////////////////////////////////////////// def destroy(self, *e): if self._destroyed: return self.top.destroy() self._destroyed = True def _destroy(self, *e): if self.top is not None: for afterid in self._afterid.values(): self.top.after_cancel(afterid) # Abort any download in progress. if self._downloading and self._use_threads: self._abort_download() # Make sure the garbage collector destroys these now; # otherwise, they may get destroyed when we're not in the main # thread, which would make Tkinter unhappy. self._column_vars.clear() def mainloop(self, *args, **kwargs): self.top.mainloop(*args, **kwargs) #///////////////////////////////////////////////////////////////// # HELP #///////////////////////////////////////////////////////////////// HELP = textwrap.dedent("""\ This tool can be used to download a variety of corpora and models that can be used with NLTK. Each corpus or model is distributed in a single zip file, known as a \"package file.\" You can download packages individually, or you can download pre-defined collections of packages. When you download a package, it will be saved to the \"download directory.\" A default download directory is chosen when you run the downloader; but you may also select a different download directory. On Windows, the default download directory is \"package.\" The NLTK downloader can be used to download a variety of corpora, models, and other data packages. Keyboard shortcuts:: [return]\t Download [up]\t Select previous package [down]\t Select next package [left]\t Select previous tab [right]\t Select next tab """) def help(self, *e): # The default font's not very legible; try using 'fixed' instead. 
try: ShowText(self.top, 'Help: NLTK Dowloader', self.HELP.strip(), width=75, font='fixed') except: ShowText(self.top, 'Help: NLTK Downloader', self.HELP.strip(), width=75) def about(self, *e): ABOUT = ("NLTK Downloader\n"+ "Written by Edward Loper") TITLE = 'About: NLTK Downloader' try: from tkMessageBox import Message Message(message=ABOUT, title=TITLE).show() except ImportError: try: from tkinter.messagebox import Message Message(message=ABOUT, title=TITLE).show() except ImportError: ShowText(self.top, TITLE, ABOUT) #///////////////////////////////////////////////////////////////// # Progress Bar #///////////////////////////////////////////////////////////////// _gradient_width = 5 def _init_progressbar(self): c = self._progressbar width, height = int(c['width']), int(c['height']) for i in range(0, (int(c['width'])*2)//self._gradient_width): c.create_line(i*self._gradient_width+20, -20, i*self._gradient_width-height-20, height+20, width=self._gradient_width, fill='#%02x0000' % (80 + abs(i%6-3)*12)) c.addtag_all('gradient') c.itemconfig('gradient', state='hidden') # This is used to display progress c.addtag_withtag('redbox', c.create_rectangle( 0, 0, 0, 0, fill=self._PROGRESS_COLOR[0])) def _show_progress(self, percent): c = self._progressbar if percent is None: c.coords('redbox', 0, 0, 0, 0) c.itemconfig('gradient', state='hidden') else: width, height = int(c['width']), int(c['height']) x = percent * int(width) // 100 + 1 c.coords('redbox', 0, 0, x, height+1) def _progress_alive(self): c = self._progressbar if not self._downloading: c.itemconfig('gradient', state='hidden') else: c.itemconfig('gradient', state='normal') x1, y1, x2, y2 = c.bbox('gradient') if x1 <= -100: c.move('gradient', (self._gradient_width*6)-4, 0) else: c.move('gradient', -4, 0) afterid = self.top.after(200, self._progress_alive) self._afterid['_progress_alive'] = afterid #///////////////////////////////////////////////////////////////// # Threaded downloader #///////////////////////////////////////////////////////////////// def _download_threaded(self, *e): # If the user tries to start a new download while we're already # downloading something, then abort the current download instead. if self._downloading: self._abort_download() return # Change the 'download' button to an 'abort' button. self._download_button['text'] = 'Cancel' marked = [self._table[row, 'Identifier'] for row in range(len(self._table)) if self._table[row, 0] != ''] selection = self._table.selected_row() if not marked and selection is not None: marked = [self._table[selection, 'Identifier']] # Create a new data server object for the download operation, # just in case the user modifies our data server during the # download (e.g., clicking 'refresh' or editing the index url). ds = Downloader(self._ds.url, self._ds.download_dir) # Start downloading in a separate thread. assert self._download_msg_queue == [] assert self._download_abort_queue == [] self._DownloadThread(ds, marked, self._download_lock, self._download_msg_queue, self._download_abort_queue).start() # Monitor the download message queue & display its progress. self._log_indent = 0 self._downloading = True self._monitor_message_queue() # Display an indication that we're still alive and well by # cycling the progress bar. 
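# The message stream handled by _download_cb above can also be consumed
# directly, without the GUI; a rough sketch ('punkt' is illustrative):
#
#     from nltk.downloader import Downloader, ProgressMessage, ErrorMessage
#     ds = Downloader()
#     for msg in ds.incr_download(['punkt']):
#         if isinstance(msg, ProgressMessage):
#             print('%s%% done' % msg.progress)
#         elif isinstance(msg, ErrorMessage):
#             print('error: %s' % msg.message)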
self._progress_alive() def _abort_download(self): if self._downloading: self._download_lock.acquire() self._download_abort_queue.append('abort') self._download_lock.release() class _DownloadThread(threading.Thread): def __init__(self, data_server, items, lock, message_queue, abort): self.data_server = data_server self.items = items self.lock = lock self.message_queue = message_queue self.abort = abort threading.Thread.__init__(self) def run (self): for msg in self.data_server.incr_download(self.items): self.lock.acquire() self.message_queue.append(msg) # Check if we've been told to kill ourselves: if self.abort: self.message_queue.append('aborted') self.lock.release() return self.lock.release() self.lock.acquire() self.message_queue.append('finished') self.lock.release() _MONITOR_QUEUE_DELAY=100 def _monitor_message_queue(self): def show(s): self._progresslabel['text'] = s self._log(s) # Try to acquire the lock; if it's busy, then just try again later. if not self._download_lock.acquire(): return for msg in self._download_msg_queue: # Done downloading? if msg == 'finished' or msg == 'aborted': #self._fill_table(sort=False) self._update_table_status() self._downloading = False self._download_button['text'] = 'Download' del self._download_msg_queue[:] del self._download_abort_queue[:] self._download_lock.release() if msg == 'aborted': show('Download aborted!') self._show_progress(None) else: afterid = self.top.after(100, self._show_progress, None) self._afterid['_monitor_message_queue'] = afterid return # All other messages elif isinstance(msg, ProgressMessage): self._show_progress(msg.progress) elif isinstance(msg, ErrorMessage): show(msg.message) if msg.package is not None: self._select(msg.package.id) self._show_progress(None) self._downloading = False return # halt progress. elif isinstance(msg, StartCollectionMessage): show('Downloading collection %r' % msg.collection.id) self._log_indent += 1 elif isinstance(msg, StartPackageMessage): self._ds.clear_status_cache(msg.package.id) show('Downloading package %r' % msg.package.id) elif isinstance(msg, UpToDateMessage): show('Package %s is up-to-date!' % msg.package.id) #elif isinstance(msg, StaleMessage): # show('Package %s is out-of-date or corrupt; updating it' % # msg.package.id) elif isinstance(msg, FinishDownloadMessage): show('Finished downloading %r.' % msg.package.id) elif isinstance(msg, StartUnzipMessage): show('Unzipping %s' % msg.package.filename) elif isinstance(msg, FinishUnzipMessage): show('Finished installing %s' % msg.package.id) elif isinstance(msg, FinishCollectionMessage): self._log_indent -= 1 show('Finished downloading collection %r.' % msg.collection.id) self._clear_mark(msg.collection.id) elif isinstance(msg, FinishPackageMessage): self._update_table_status() self._clear_mark(msg.package.id) # Let the user know when we're aborting a download (but # waiting for a good point to abort it, so we don't end up # with a partially unzipped package or anything like that). if self._download_abort_queue: self._progresslabel['text'] = 'Aborting download...' # Clear the message queue and then release the lock del self._download_msg_queue[:] self._download_lock.release() # Check the queue again after MONITOR_QUEUE_DELAY msec. 
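# The same worker-thread pattern in miniature, using queue.Queue (Python 3
# module names) rather than the shared list and lock used here; this is
# illustrative only, not how the downloader itself is wired up:
#
#     import threading, queue
#     from tkinter import Tk
#     msgs = queue.Queue()
#     threading.Thread(target=lambda: msgs.put('finished'), daemon=True).start()
#     root = Tk()
#     def poll():
#         try:
#             print(msgs.get_nowait())
#         except queue.Empty:
#             root.after(100, poll)      # check the queue again later
#     poll()
#     root.mainloop()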
afterid = self.top.after(self._MONITOR_QUEUE_DELAY, self._monitor_message_queue) self._afterid['_monitor_message_queue'] = afterid ###################################################################### # Helper Functions ###################################################################### # [xx] It may make sense to move these to nltk.internals. def md5_hexdigest(file): """ Calculate and return the MD5 checksum for a given file. ``file`` may either be a filename or an open stream. """ if isinstance(file, compat.string_types): with open(file, 'rb') as infile: return _md5_hexdigest(infile) return _md5_hexdigest(file) def _md5_hexdigest(fp): md5_digest = md5() while True: block = fp.read(1024*16) # 16k blocks if not block: break md5_digest.update(block) return md5_digest.hexdigest() # change this to periodically yield progress messages? # [xx] get rid of topdir parameter -- we should be checking # this when we build the index, anyway. def unzip(filename, root, verbose=True): """ Extract the contents of the zip file ``filename`` into the directory ``root``. """ for message in _unzip_iter(filename, root, verbose): if isinstance(message, ErrorMessage): raise Exception(message) def _unzip_iter(filename, root, verbose=True): if verbose: sys.stdout.write('Unzipping %s' % os.path.split(filename)[1]) sys.stdout.flush() try: zf = zipfile.ZipFile(filename) except zipfile.error as e: yield ErrorMessage(filename, 'Error with downloaded zip file') return except Exception as e: yield ErrorMessage(filename, e) return # Get lists of directories & files namelist = zf.namelist() dirlist = set() for x in namelist: if x.endswith('/'): dirlist.add(x) else: dirlist.add(x.rsplit('/',1)[0] + '/') filelist = [x for x in namelist if not x.endswith('/')] # Create the target directory if it doesn't exist if not os.path.exists(root): os.mkdir(root) # Create the directory structure for dirname in sorted(dirlist): pieces = dirname[:-1].split('/') for i in range(len(pieces)): dirpath = os.path.join(root, *pieces[:i+1]) if not os.path.exists(dirpath): os.mkdir(dirpath) # Extract files. for i, filename in enumerate(filelist): filepath = os.path.join(root, *filename.split('/')) with open(filepath, 'wb') as outfile: try: contents = zf.read(filename) except Exception as e: yield ErrorMessage(filename, e) return outfile.write(contents) if verbose and (i*10/len(filelist) > (i-1)*10/len(filelist)): sys.stdout.write('.') sys.stdout.flush() if verbose: print() ###################################################################### # Index Builder ###################################################################### # This may move to a different file sometime. import subprocess, zipfile def build_index(root, base_url): """ Create a new data.xml index file, by combining the xml description files for various packages and collections. ``root`` should be the path to a directory containing the package xml and zip files; and the collection xml files. The ``root`` directory is expected to have the following subdirectories:: root/ packages/ .................. subdirectory for packages corpora/ ................. zip & xml files for corpora grammars/ ................ zip & xml files for grammars taggers/ ................. zip & xml files for taggers tokenizers/ .............. zip & xml files for tokenizers etc. collections/ ............... 
xml files for collections For each package, there should be two files: ``package.zip`` (where *package* is the package name) which contains the package itself as a compressed zip file; and ``package.xml``, which is an xml description of the package. The zipfile ``package.zip`` should expand to a single subdirectory named ``package/``. The base filename ``package`` must match the identifier given in the package's xml file. For each collection, there should be a single file ``collection.zip`` describing the collection, where *collection* is the name of the collection. All identifiers (for both packages and collections) must be unique. """ # Find all packages. packages = [] for pkg_xml, zf, subdir in _find_packages(os.path.join(root, 'packages')): zipstat = os.stat(zf.filename) url = '%s/%s/%s' % (base_url, subdir, os.path.split(zf.filename)[1]) unzipped_size = sum(zf_info.file_size for zf_info in zf.infolist()) # Fill in several fields of the package xml with calculated values. pkg_xml.set('unzipped_size', '%s' % unzipped_size) pkg_xml.set('size', '%s' % zipstat.st_size) pkg_xml.set('checksum', '%s' % md5_hexdigest(zf.filename)) pkg_xml.set('subdir', subdir) #pkg_xml.set('svn_revision', _svn_revision(zf.filename)) pkg_xml.set('url', url) # Record the package. packages.append(pkg_xml) # Find all collections collections = list(_find_collections(os.path.join(root, 'collections'))) # Check that all UIDs are unique uids = set() for item in packages+collections: if item.get('id') in uids: raise ValueError('Duplicate UID: %s' % item.get('id')) uids.add(item.get('id')) # Put it all together top_elt = ElementTree.Element('nltk_data') top_elt.append(ElementTree.Element('packages')) for package in packages: top_elt[0].append(package) top_elt.append(ElementTree.Element('collections')) for collection in collections: top_elt[1].append(collection) _indent_xml(top_elt) return top_elt def _indent_xml(xml, prefix=''): """ Helper for ``build_index()``: Given an XML ``ElementTree``, modify it (and its descendents) ``text`` and ``tail`` attributes to generate an indented tree, where each nested element is indented by 2 spaces with respect to its parent. """ if len(xml) > 0: xml.text = (xml.text or '').strip() + '\n' + prefix + ' ' for child in xml: _indent_xml(child, prefix+' ') for child in xml[:-1]: child.tail = (child.tail or '').strip() + '\n' + prefix + ' ' xml[-1].tail = (xml[-1].tail or '').strip() + '\n' + prefix def _check_package(pkg_xml, zipfilename, zf): """ Helper for ``build_index()``: Perform some checks to make sure that the given package is consistent. """ # The filename must patch the id given in the XML file. uid = os.path.splitext(os.path.split(zipfilename)[1])[0] if pkg_xml.get('id') != uid: raise ValueError('package identifier mismatch (%s vs %s)' % (pkg_xml.get('id'), uid)) # Zip file must expand to a subdir whose name matches uid. if sum( (name!=uid and not name.startswith(uid+'/')) for name in zf.namelist() ): raise ValueError('Zipfile %s.zip does not expand to a single ' 'subdirectory %s/' % (uid, uid)) # update for git? def _svn_revision(filename): """ Helper for ``build_index()``: Calculate the subversion revision number for a given file (by using ``subprocess`` to run ``svn``). 
""" p = subprocess.Popen(['svn', 'status', '-v', filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE) (stdout, stderr) = p.communicate() if p.returncode != 0 or stderr or not stdout: raise ValueError('Error determining svn_revision for %s: %s' % (os.path.split(filename)[1], textwrap.fill(stderr))) return stdout.split()[2] def _find_collections(root): """ Helper for ``build_index()``: Yield a list of ElementTree.Element objects, each holding the xml for a single package collection. """ packages = [] for dirname, subdirs, files in os.walk(root): for filename in files: if filename.endswith('.xml'): xmlfile = os.path.join(dirname, filename) yield ElementTree.parse(xmlfile).getroot() def _find_packages(root): """ Helper for ``build_index()``: Yield a list of tuples ``(pkg_xml, zf, subdir)``, where: - ``pkg_xml`` is an ``ElementTree.Element`` holding the xml for a package - ``zf`` is a ``zipfile.ZipFile`` for the package's contents. - ``subdir`` is the subdirectory (relative to ``root``) where the package was found (e.g. 'corpora' or 'grammars'). """ from nltk.corpus.reader.util import _path_from # Find all packages. packages = [] for dirname, subdirs, files in os.walk(root): relpath = '/'.join(_path_from(root, dirname)) for filename in files: if filename.endswith('.xml'): xmlfilename = os.path.join(dirname, filename) zipfilename = xmlfilename[:-4]+'.zip' try: zf = zipfile.ZipFile(zipfilename) except Exception as e: raise ValueError('Error reading file %r!\n%s' % (zipfilename, e)) try: pkg_xml = ElementTree.parse(xmlfilename).getroot() except Exception as e: raise ValueError('Error reading file %r!\n%s' % (xmlfilename, e)) # Check that the UID matches the filename uid = os.path.split(xmlfilename[:-4])[1] if pkg_xml.get('id') != uid: raise ValueError('package identifier mismatch (%s ' 'vs %s)' % (pkg_xml.get('id'), uid)) # Check that the zipfile expands to a subdir whose # name matches the uid. 
if sum( (name!=uid and not name.startswith(uid+'/')) for name in zf.namelist() ): raise ValueError('Zipfile %s.zip does not expand to a ' 'single subdirectory %s/' % (uid, uid)) yield pkg_xml, zf, relpath # Don't recurse into svn subdirectories: try: subdirs.remove('.svn') except ValueError: pass ###################################################################### # Main: ###################################################################### # There should be a command-line interface # Aliases _downloader = Downloader() download = _downloader.download def download_shell(): DownloaderShell(_downloader).run() def download_gui(): DownloaderGUI(_downloader).mainloop() def update(): _downloader.update() if __name__ == '__main__': from optparse import OptionParser parser = OptionParser() parser.add_option("-d", "--dir", dest="dir", help="download package to directory DIR", metavar="DIR") parser.add_option("-q", "--quiet", dest="quiet", action="store_true", default=False, help="work quietly") parser.add_option("-f", "--force", dest="force", action="store_true", default=False, help="download even if already installed") parser.add_option("-e", "--exit-on-error", dest="halt_on_error", action="store_true", default=False, help="exit if an error occurs") parser.add_option("-u", "--url", dest="server_index_url", default=None, help="download server index url") (options, args) = parser.parse_args() downloader = Downloader(server_index_url = options.server_index_url) if args: for pkg_id in args: rv = downloader.download(info_or_id=pkg_id, download_dir=options.dir, quiet=options.quiet, force=options.force, halt_on_error=options.halt_on_error) if rv==False and options.halt_on_error: break else: downloader.download(download_dir=options.dir, quiet=options.quiet, force=options.force, halt_on_error=options.halt_on_error) nltk-3.1/nltk/draw/0000755000076500000240000000000012610001541013747 5ustar sbstaff00000000000000nltk-3.1/nltk/draw/__init__.py0000644000076500000240000000166012607224144016077 0ustar sbstaff00000000000000# Natural Language Toolkit: graphical representations package # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT # Import Tkinter-based modules if Tkinter is installed import nltk.compat try: import tkinter except ImportError: import warnings warnings.warn("nltk.draw package not loaded " "(please install Tkinter library).") else: from nltk.draw.cfg import ProductionList, CFGEditor, CFGDemo from nltk.draw.tree import (TreeSegmentWidget, tree_to_treesegment, TreeWidget, TreeView, draw_trees) from nltk.draw.dispersion import dispersion_plot from nltk.draw.table import Table # skip doctests from this package def setup_module(module): from nose import SkipTest raise SkipTest("nltk.draw examples are not doctests") nltk-3.1/nltk/draw/cfg.py0000644000076500000240000007146512607224144015111 0ustar sbstaff00000000000000# Natural Language Toolkit: CFG visualization # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Visualization tools for CFGs. """ # Idea for a nice demo: # - 3 panes: grammar, treelet, working area # - grammar is a list of productions # - when you select a production, the treelet that it licenses appears # in the treelet area # - the working area has the text on the bottom, and S at top. 
When # you select a production, it shows (ghosted) the locations where # that production's treelet could be attached to either the text # or the tree rooted at S. # - the user can drag the treelet onto one of those (or click on them?) # - the user can delete pieces of the tree from the working area # (right click?) # - connecting top to bottom? drag one NP onto another? # # +-------------------------------------------------------------+ # | S -> NP VP | S | # |[NP -> Det N ]| / \ | # | ... | NP VP | # | N -> 'dog' | | # | N -> 'cat' | | # | ... | | # +--------------+ | # | NP | Det N | # | / \ | | | | # | Det N | the cat saw the dog | # | | | # +--------------+----------------------------------------------+ # # Operations: # - connect a new treelet -- drag or click shadow # - delete a treelet -- right click # - if only connected to top, delete everything below # - if only connected to bottom, delete everything above # - connect top & bottom -- drag a leaf to a root or a root to a leaf # - disconnect top & bottom -- right click # - if connected to top & bottom, then disconnect import nltk.compat import re from tkinter import (Button, Canvas, Entry, Frame, IntVar, Label, Scrollbar, Text, Tk, Toplevel) from nltk.grammar import (CFG, _read_cfg_production, Nonterminal, nonterminals) from nltk.tree import Tree from nltk.draw.tree import TreeSegmentWidget, tree_to_treesegment from nltk.draw.util import (CanvasFrame, ColorizedList, ShowText, SymbolWidget, TextWidget) from nltk import compat ###################################################################### # Production List ###################################################################### class ProductionList(ColorizedList): ARROW = SymbolWidget.SYMBOLS['rightarrow'] def _init_colortags(self, textwidget, options): textwidget.tag_config('terminal', foreground='#006000') textwidget.tag_config('arrow', font='symbol', underline='0') textwidget.tag_config('nonterminal', foreground='blue', font=('helvetica', -12, 'bold')) def _item_repr(self, item): contents = [] contents.append(('%s\t' % item.lhs(), 'nonterminal')) contents.append((self.ARROW, 'arrow')) for elt in item.rhs(): if isinstance(elt, Nonterminal): contents.append((' %s' % elt.symbol(), 'nonterminal')) else: contents.append((' %r' % elt, 'terminal')) return contents ###################################################################### # CFG Editor ###################################################################### _CFGEditor_HELP = """ The CFG Editor can be used to create or modify context free grammars. A context free grammar consists of a start symbol and a list of productions. The start symbol is specified by the text entry field in the upper right hand corner of the editor; and the list of productions are specified in the main text editing box. Every non-blank line specifies a single production. Each production has the form "LHS -> RHS," where LHS is a single nonterminal, and RHS is a list of nonterminals and terminals. Nonterminals must be a single word, such as S or NP or NP_subj. Currently, nonterminals must consists of alphanumeric characters and underscores (_). Nonterminals are colored blue. If you place the mouse over any nonterminal, then all occurrences of that nonterminal will be highlighted. Termianals must be surrounded by single quotes (') or double quotes(\"). For example, "dog" and "New York" are terminals. Currently, the string within the quotes must consist of alphanumeric characters, underscores, and spaces. 
To enter a new production, go to a blank line, and type a nonterminal, followed by an arrow (->), followed by a sequence of terminals and nonterminals. Note that "->" (dash + greater-than) is automatically converted to an arrow symbol. When you move your cursor to a different line, your production will automatically be colorized. If there are any errors, they will be highlighted in red. Note that the order of the productions is significant for some algorithms. To re-order the productions, use cut and paste to move them. Use the buttons at the bottom of the window when you are done editing the CFG: - Ok: apply the new CFG, and exit the editor. - Apply: apply the new CFG, and do not exit the editor. - Reset: revert to the original CFG, and do not exit the editor. - Cancel: revert to the original CFG, and exit the editor. """ class CFGEditor(object): """ A dialog window for creating and editing context free grammars. ``CFGEditor`` imposes the following restrictions: - All nonterminals must be strings consisting of word characters. - All terminals must be strings consisting of word characters and space characters. """ # Regular expressions used by _analyze_line. Precompile them, so # we can process the text faster. ARROW = SymbolWidget.SYMBOLS['rightarrow'] _LHS_RE = re.compile(r"(^\s*\w+\s*)(->|("+ARROW+"))") _ARROW_RE = re.compile("\s*(->|("+ARROW+"))\s*") _PRODUCTION_RE = re.compile(r"(^\s*\w+\s*)" + # LHS "(->|("+ARROW+"))\s*" + # arrow r"((\w+|'[\w ]*'|\"[\w ]*\"|\|)\s*)*$") # RHS _TOKEN_RE = re.compile("\\w+|->|'[\\w ]+'|\"[\\w ]+\"|("+ARROW+")") _BOLD = ('helvetica', -12, 'bold') def __init__(self, parent, cfg=None, set_cfg_callback=None): self._parent = parent if cfg is not None: self._cfg = cfg else: self._cfg = CFG(Nonterminal('S'), []) self._set_cfg_callback = set_cfg_callback self._highlight_matching_nonterminals = 1 # Create the top-level window. 
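# The production syntax described in the editor help text above is the
# same one accepted by CFG.fromstring, so an initial grammar for the
# editor can be built like this (the grammar itself is illustrative, and
# ``parent`` is assumed to be an existing Tk window):
#
#     from nltk import CFG
#     grammar = CFG.fromstring("""
#         S   -> NP VP
#         NP  -> Det N | 'I'
#         VP  -> V NP
#         Det -> 'the' | 'a'
#         N   -> 'dog' | 'park'
#         V   -> 'saw'
#     """)
#     CFGEditor(parent, grammar, set_cfg_callback=print)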
self._top = Toplevel(parent) self._init_bindings() self._init_startframe() self._startframe.pack(side='top', fill='x', expand=0) self._init_prodframe() self._prodframe.pack(side='top', fill='both', expand=1) self._init_buttons() self._buttonframe.pack(side='bottom', fill='x', expand=0) self._textwidget.focus() def _init_startframe(self): frame = self._startframe = Frame(self._top) self._start = Entry(frame) self._start.pack(side='right') Label(frame, text='Start Symbol:').pack(side='right') Label(frame, text='Productions:').pack(side='left') self._start.insert(0, self._cfg.start().symbol()) def _init_buttons(self): frame = self._buttonframe = Frame(self._top) Button(frame, text='Ok', command=self._ok, underline=0, takefocus=0).pack(side='left') Button(frame, text='Apply', command=self._apply, underline=0, takefocus=0).pack(side='left') Button(frame, text='Reset', command=self._reset, underline=0, takefocus=0,).pack(side='left') Button(frame, text='Cancel', command=self._cancel, underline=0, takefocus=0).pack(side='left') Button(frame, text='Help', command=self._help, underline=0, takefocus=0).pack(side='right') def _init_bindings(self): self._top.title('CFG Editor') self._top.bind('', self._cancel) self._top.bind('', self._cancel) self._top.bind('', self._cancel) #self._top.bind('', self._cancel) self._top.bind('', self._cancel) self._top.bind('', self._cancel) #self._top.bind('', self._cancel) self._top.bind('', self._cancel) self._top.bind('', self._ok) self._top.bind('', self._ok) self._top.bind('', self._apply) self._top.bind('', self._apply) self._top.bind('', self._reset) self._top.bind('', self._reset) self._top.bind('', self._help) self._top.bind('', self._help) self._top.bind('', self._help) def _init_prodframe(self): self._prodframe = Frame(self._top) # Create the basic Text widget & scrollbar. self._textwidget = Text(self._prodframe, background='#e0e0e0', exportselection=1) self._textscroll = Scrollbar(self._prodframe, takefocus=0, orient='vertical') self._textwidget.config(yscrollcommand = self._textscroll.set) self._textscroll.config(command=self._textwidget.yview) self._textscroll.pack(side='right', fill='y') self._textwidget.pack(expand=1, fill='both', side='left') # Initialize the colorization tags. Each nonterminal gets its # own tag, so they aren't listed here. self._textwidget.tag_config('terminal', foreground='#006000') self._textwidget.tag_config('arrow', font='symbol') self._textwidget.tag_config('error', background='red') # Keep track of what line they're on. We use that to remember # to re-analyze a line whenever they leave it. self._linenum = 0 # Expand "->" to an arrow. self._top.bind('>', self._replace_arrows) # Re-colorize lines when appropriate. self._top.bind('<>', self._analyze) self._top.bind('', self._check_analyze) self._top.bind('', self._check_analyze) # Tab cycles focus. (why doesn't this work??) 
def cycle(e, textwidget=self._textwidget): textwidget.tk_focusNext().focus() self._textwidget.bind('', cycle) prod_tuples = [(p.lhs(),[p.rhs()]) for p in self._cfg.productions()] for i in range(len(prod_tuples)-1,0,-1): if (prod_tuples[i][0] == prod_tuples[i-1][0]): if () in prod_tuples[i][1]: continue if () in prod_tuples[i-1][1]: continue print(prod_tuples[i-1][1]) print(prod_tuples[i][1]) prod_tuples[i-1][1].extend(prod_tuples[i][1]) del prod_tuples[i] for lhs, rhss in prod_tuples: print(lhs, rhss) s = '%s ->' % lhs for rhs in rhss: for elt in rhs: if isinstance(elt, Nonterminal): s += ' %s' % elt else: s += ' %r' % elt s += ' |' s = s[:-2] + '\n' self._textwidget.insert('end', s) self._analyze() # # Add the producitons to the text widget, and colorize them. # prod_by_lhs = {} # for prod in self._cfg.productions(): # if len(prod.rhs()) > 0: # prod_by_lhs.setdefault(prod.lhs(),[]).append(prod) # for (lhs, prods) in prod_by_lhs.items(): # self._textwidget.insert('end', '%s ->' % lhs) # self._textwidget.insert('end', self._rhs(prods[0])) # for prod in prods[1:]: # print '\t|'+self._rhs(prod), # self._textwidget.insert('end', '\t|'+self._rhs(prod)) # print # self._textwidget.insert('end', '\n') # for prod in self._cfg.productions(): # if len(prod.rhs()) == 0: # self._textwidget.insert('end', '%s' % prod) # self._analyze() # def _rhs(self, prod): # s = '' # for elt in prod.rhs(): # if isinstance(elt, Nonterminal): s += ' %s' % elt.symbol() # else: s += ' %r' % elt # return s def _clear_tags(self, linenum): """ Remove all tags (except ``arrow`` and ``sel``) from the given line of the text widget used for editing the productions. """ start = '%d.0'%linenum end = '%d.end'%linenum for tag in self._textwidget.tag_names(): if tag not in ('arrow', 'sel'): self._textwidget.tag_remove(tag, start, end) def _check_analyze(self, *e): """ Check if we've moved to a new line. If we have, then remove all colorization from the line we moved to, and re-colorize the line that we moved from. """ linenum = int(self._textwidget.index('insert').split('.')[0]) if linenum != self._linenum: self._clear_tags(linenum) self._analyze_line(self._linenum) self._linenum = linenum def _replace_arrows(self, *e): """ Replace any ``'->'`` text strings with arrows (char \\256, in symbol font). This searches the whole buffer, but is fast enough to be done anytime they press '>'. """ arrow = '1.0' while True: arrow = self._textwidget.search('->', arrow, 'end+1char') if arrow == '': break self._textwidget.delete(arrow, arrow+'+2char') self._textwidget.insert(arrow, self.ARROW, 'arrow') self._textwidget.insert(arrow, '\t') arrow = '1.0' while True: arrow = self._textwidget.search(self.ARROW, arrow+'+1char', 'end+1char') if arrow == '': break self._textwidget.tag_add('arrow', arrow, arrow+'+1char') def _analyze_token(self, match, linenum): """ Given a line number and a regexp match for a token on that line, colorize the token. Note that the regexp match gives us the token's text, start index (on the line), and end index (on the line). """ # What type of token is it? if match.group()[0] in "'\"": tag = 'terminal' elif match.group() in ('->', self.ARROW): tag = 'arrow' else: # If it's a nonterminal, then set up new bindings, so we # can highlight all instances of that nonterminal when we # put the mouse over it. 
tag = 'nonterminal_'+match.group() if tag not in self._textwidget.tag_names(): self._init_nonterminal_tag(tag) start = '%d.%d' % (linenum, match.start()) end = '%d.%d' % (linenum, match.end()) self._textwidget.tag_add(tag, start, end) def _init_nonterminal_tag(self, tag, foreground='blue'): self._textwidget.tag_config(tag, foreground=foreground, font=CFGEditor._BOLD) if not self._highlight_matching_nonterminals: return def enter(e, textwidget=self._textwidget, tag=tag): textwidget.tag_config(tag, background='#80ff80') def leave(e, textwidget=self._textwidget, tag=tag): textwidget.tag_config(tag, background='') self._textwidget.tag_bind(tag, '', enter) self._textwidget.tag_bind(tag, '', leave) def _analyze_line(self, linenum): """ Colorize a given line. """ # Get rid of any tags that were previously on the line. self._clear_tags(linenum) # Get the line line's text string. line = self._textwidget.get(repr(linenum)+'.0', repr(linenum)+'.end') # If it's a valid production, then colorize each token. if CFGEditor._PRODUCTION_RE.match(line): # It's valid; Use _TOKEN_RE to tokenize the production, # and call analyze_token on each token. def analyze_token(match, self=self, linenum=linenum): self._analyze_token(match, linenum) return '' CFGEditor._TOKEN_RE.sub(analyze_token, line) elif line.strip() != '': # It's invalid; show the user where the error is. self._mark_error(linenum, line) def _mark_error(self, linenum, line): """ Mark the location of an error in a line. """ arrowmatch = CFGEditor._ARROW_RE.search(line) if not arrowmatch: # If there's no arrow at all, highlight the whole line. start = '%d.0' % linenum end = '%d.end' % linenum elif not CFGEditor._LHS_RE.match(line): # Otherwise, if the LHS is bad, highlight it. start = '%d.0' % linenum end = '%d.%d' % (linenum, arrowmatch.start()) else: # Otherwise, highlight the RHS. start = '%d.%d' % (linenum, arrowmatch.end()) end = '%d.end' % linenum # If we're highlighting 0 chars, highlight the whole line. if self._textwidget.compare(start, '==', end): start = '%d.0' % linenum end = '%d.end' % linenum self._textwidget.tag_add('error', start, end) def _analyze(self, *e): """ Replace ``->`` with arrows, and colorize the entire buffer. """ self._replace_arrows() numlines = int(self._textwidget.index('end').split('.')[0]) for linenum in range(1, numlines+1): # line numbers start at 1. self._analyze_line(linenum) def _parse_productions(self): """ Parse the current contents of the textwidget buffer, to create a list of productions. """ productions = [] # Get the text, normalize it, and split it into lines. 
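# Each non-blank line is handed to _read_cfg_production below, which
# returns a list of Production objects, one per '|' alternative; roughly:
#
#     from nltk.grammar import _read_cfg_production
#     _read_cfg_production("NP -> Det N | 'I'")
#     # => [NP -> Det N, NP -> 'I']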
text = self._textwidget.get('1.0', 'end') text = re.sub(self.ARROW, '->', text) text = re.sub('\t', ' ', text) lines = text.split('\n') # Convert each line to a CFG production for line in lines: line = line.strip() if line=='': continue productions += _read_cfg_production(line) #if line.strip() == '': continue #if not CFGEditor._PRODUCTION_RE.match(line): # raise ValueError('Bad production string %r' % line) # #(lhs_str, rhs_str) = line.split('->') #lhs = Nonterminal(lhs_str.strip()) #rhs = [] #def parse_token(match, rhs=rhs): # token = match.group() # if token[0] in "'\"": rhs.append(token[1:-1]) # else: rhs.append(Nonterminal(token)) # return '' #CFGEditor._TOKEN_RE.sub(parse_token, rhs_str) # #productions.append(Production(lhs, *rhs)) return productions def _destroy(self, *e): if self._top is None: return self._top.destroy() self._top = None def _ok(self, *e): self._apply() self._destroy() def _apply(self, *e): productions = self._parse_productions() start = Nonterminal(self._start.get()) cfg = CFG(start, productions) if self._set_cfg_callback is not None: self._set_cfg_callback(cfg) def _reset(self, *e): self._textwidget.delete('1.0', 'end') for production in self._cfg.productions(): self._textwidget.insert('end', '%s\n' % production) self._analyze() if self._set_cfg_callback is not None: self._set_cfg_callback(self._cfg) def _cancel(self, *e): try: self._reset() except: pass self._destroy() def _help(self, *e): # The default font's not very legible; try using 'fixed' instead. try: ShowText(self._parent, 'Help: Chart Parser Demo', (_CFGEditor_HELP).strip(), width=75, font='fixed') except: ShowText(self._parent, 'Help: Chart Parser Demo', (_CFGEditor_HELP).strip(), width=75) ###################################################################### # New Demo (built tree based on cfg) ###################################################################### class CFGDemo(object): def __init__(self, grammar, text): self._grammar = grammar self._text = text # Set up the main window. 
self._top = Tk() self._top.title('Context Free Grammar Demo') # Base font size self._size = IntVar(self._top) self._size.set(12) # = medium # Set up the key bindings self._init_bindings(self._top) # Create the basic frames frame1 = Frame(self._top) frame1.pack(side='left', fill='y', expand=0) self._init_menubar(self._top) self._init_buttons(self._top) self._init_grammar(frame1) self._init_treelet(frame1) self._init_workspace(self._top) #////////////////////////////////////////////////// # Initialization #////////////////////////////////////////////////// def _init_bindings(self, top): top.bind('', self.destroy) def _init_menubar(self, parent): pass def _init_buttons(self, parent): pass def _init_grammar(self, parent): self._prodlist = ProductionList(parent, self._grammar, width=20) self._prodlist.pack(side='top', fill='both', expand=1) self._prodlist.focus() self._prodlist.add_callback('select', self._selectprod_cb) self._prodlist.add_callback('move', self._selectprod_cb) def _init_treelet(self, parent): self._treelet_canvas = Canvas(parent, background='white') self._treelet_canvas.pack(side='bottom', fill='x') self._treelet = None def _init_workspace(self, parent): self._workspace = CanvasFrame(parent, background='white') self._workspace.pack(side='right', fill='both', expand=1) self._tree = None self.reset_workspace() #////////////////////////////////////////////////// # Workspace #////////////////////////////////////////////////// def reset_workspace(self): c = self._workspace.canvas() fontsize = int(self._size.get()) node_font = ('helvetica', -(fontsize+4), 'bold') leaf_font = ('helvetica', -(fontsize+2)) # Remove the old tree if self._tree is not None: self._workspace.remove_widget(self._tree) # The root of the tree. start = self._grammar.start().symbol() rootnode = TextWidget(c, start, font=node_font, draggable=1) # The leaves of the tree. leaves = [] for word in self._text: leaves.append(TextWidget(c, word, font=leaf_font, draggable=1)) # Put it all together into one tree self._tree = TreeSegmentWidget(c, rootnode, leaves, color='white') # Add it to the workspace. self._workspace.add_widget(self._tree) # Move the leaves to the bottom of the workspace. for leaf in leaves: leaf.move(0,100) #self._nodes = {start:1} #self._leaves = dict([(l,1) for l in leaves]) def workspace_markprod(self, production): pass def _markproduction(self, prod, tree=None): if tree is None: tree = self._tree for i in range(len(tree.subtrees())-len(prod.rhs())): if tree['color', i] == 'white': self._markproduction for j, node in enumerate(prod.rhs()): widget = tree.subtrees()[i+j] if (isinstance(node, Nonterminal) and isinstance(widget, TreeSegmentWidget) and node.symbol == widget.label().text()): pass # matching nonterminal elif (isinstance(node, compat.string_types) and isinstance(widget, TextWidget) and node == widget.text()): pass # matching nonterminal else: break else: # Everything matched! print('MATCH AT', i) #////////////////////////////////////////////////// # Grammar #////////////////////////////////////////////////// def _selectprod_cb(self, production): canvas = self._treelet_canvas self._prodlist.highlight(production) if self._treelet is not None: self._treelet.destroy() # Convert the production to a tree. rhs = production.rhs() for (i, elt) in enumerate(rhs): if isinstance(elt, Nonterminal): elt = Tree(elt) tree = Tree(production.lhs().symbol(), *rhs) # Draw the tree in the treelet area. 
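# Outside of this demo, a tree can be rendered in its own window using
# nltk.draw.tree; a minimal sketch:
#
#     from nltk import Tree
#     from nltk.draw.tree import draw_trees
#     draw_trees(Tree.fromstring("(S (NP I) (VP (V saw) (NP (Det a) (N dog))))"))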
fontsize = int(self._size.get()) node_font = ('helvetica', -(fontsize+4), 'bold') leaf_font = ('helvetica', -(fontsize+2)) self._treelet = tree_to_treesegment(canvas, tree, node_font=node_font, leaf_font=leaf_font) self._treelet['draggable'] = 1 # Center the treelet. (x1, y1, x2, y2) = self._treelet.bbox() w, h = int(canvas['width']), int(canvas['height']) self._treelet.move((w-x1-x2)/2, (h-y1-y2)/2) # Mark the places where we can add it to the workspace. self._markproduction(production) def destroy(self, *args): self._top.destroy() def mainloop(self, *args, **kwargs): self._top.mainloop(*args, **kwargs) def demo2(): from nltk import Nonterminal, Production, CFG nonterminals = 'S VP NP PP P N Name V Det' (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s) for s in nonterminals.split()] productions = ( # Syntactic Productions Production(S, [NP, VP]), Production(NP, [Det, N]), Production(NP, [NP, PP]), Production(VP, [VP, PP]), Production(VP, [V, NP, PP]), Production(VP, [V, NP]), Production(PP, [P, NP]), Production(PP, []), Production(PP, ['up', 'over', NP]), # Lexical Productions Production(NP, ['I']), Production(Det, ['the']), Production(Det, ['a']), Production(N, ['man']), Production(V, ['saw']), Production(P, ['in']), Production(P, ['with']), Production(N, ['park']), Production(N, ['dog']), Production(N, ['statue']), Production(Det, ['my']), ) grammar = CFG(S, productions) text = 'I saw a man in the park'.split() d=CFGDemo(grammar, text) d.mainloop() ###################################################################### # Old Demo ###################################################################### def demo(): from nltk import Nonterminal, CFG nonterminals = 'S VP NP PP P N Name V Det' (S, VP, NP, PP, P, N, Name, V, Det) = [Nonterminal(s) for s in nonterminals.split()] grammar = CFG.fromstring(""" S -> NP VP PP -> P NP NP -> Det N NP -> NP PP VP -> V NP VP -> VP PP Det -> 'a' Det -> 'the' Det -> 'my' NP -> 'I' N -> 'dog' N -> 'man' N -> 'park' N -> 'statue' V -> 'saw' P -> 'in' P -> 'up' P -> 'over' P -> 'with' """) def cb(grammar): print(grammar) top = Tk() editor = CFGEditor(top, grammar, cb) Label(top, text='\nTesting CFG Editor\n').pack() Button(top, text='Quit', command=top.destroy).pack() top.mainloop() def demo3(): from nltk import Production (S, VP, NP, PP, P, N, Name, V, Det) = \ nonterminals('S, VP, NP, PP, P, N, Name, V, Det') productions = ( # Syntactic Productions Production(S, [NP, VP]), Production(NP, [Det, N]), Production(NP, [NP, PP]), Production(VP, [VP, PP]), Production(VP, [V, NP, PP]), Production(VP, [V, NP]), Production(PP, [P, NP]), Production(PP, []), Production(PP, ['up', 'over', NP]), # Lexical Productions Production(NP, ['I']), Production(Det, ['the']), Production(Det, ['a']), Production(N, ['man']), Production(V, ['saw']), Production(P, ['in']), Production(P, ['with']), Production(N, ['park']), Production(N, ['dog']), Production(N, ['statue']), Production(Det, ['my']), ) t = Tk() def destroy(e, t=t): t.destroy() t.bind('q', destroy) p = ProductionList(t, productions) p.pack(expand=1, fill='both') p.add_callback('select', p.markonly) p.add_callback('move', p.markonly) p.focus() p.mark(productions[2]) p.mark(productions[8]) if __name__ == '__main__': demo() nltk-3.1/nltk/draw/dispersion.py0000644000076500000240000000332012607224144016512 0ustar sbstaff00000000000000# Natural Language Toolkit: Dispersion Plots # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT """ A utility for displaying lexical 
dispersion. """ def dispersion_plot(text, words, ignore_case=False, title="Lexical Dispersion Plot"): """ Generate a lexical dispersion plot. :param text: The source text :type text: list(str) or enum(str) :param words: The target words :type words: list of str :param ignore_case: flag to set if case should be ignored when searching text :type ignore_case: bool """ try: from matplotlib import pylab except ImportError: raise ValueError('The plot function requires matplotlib to be installed.' 'See http://matplotlib.org/') text = list(text) words.reverse() if ignore_case: words_to_comp = list(map(str.lower, words)) text_to_comp = list(map(str.lower, text)) else: words_to_comp = words text_to_comp = text points = [(x,y) for x in range(len(text_to_comp)) for y in range(len(words_to_comp)) if text_to_comp[x] == words_to_comp[y]] if points: x, y = list(zip(*points)) else: x = y = () pylab.plot(x, y, "b|", scalex=.1) pylab.yticks(list(range(len(words))), words, color="b") pylab.ylim(-1, len(words)) pylab.title(title) pylab.xlabel("Word Offset") pylab.show() if __name__ == '__main__': import nltk.compat from nltk.corpus import gutenberg words = ['Elinor', 'Marianne', 'Edward', 'Willoughby'] dispersion_plot(gutenberg.words('austen-sense.txt'), words) nltk-3.1/nltk/draw/table.py0000644000076500000240000012670512607224144015437 0ustar sbstaff00000000000000# Natural Language Toolkit: Table widget # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Tkinter widgets for displaying multi-column listboxes and tables. """ import nltk.compat import operator from tkinter import (Frame, Label, Listbox, Scrollbar, Tk) ###################################################################### # Multi-Column Listbox ###################################################################### class MultiListbox(Frame): """ A multi-column listbox, where the current selection applies to an entire row. Based on the MultiListbox Tkinter widget recipe from the Python Cookbook (http://code.activestate.com/recipes/52266/) For the most part, ``MultiListbox`` methods delegate to its contained listboxes. For any methods that do not have docstrings, see ``Tkinter.Listbox`` for a description of what that method does. """ #///////////////////////////////////////////////////////////////// # Configuration #///////////////////////////////////////////////////////////////// #: Default configuration values for the frame. FRAME_CONFIG = dict(background='#888', takefocus=True, highlightthickness=1) #: Default configurations for the column labels. LABEL_CONFIG = dict(borderwidth=1, relief='raised', font='helvetica -16 bold', background='#444', foreground='white') #: Default configuration for the column listboxes. LISTBOX_CONFIG = dict(borderwidth=1, selectborderwidth=0, highlightthickness=0, exportselection=False, selectbackground='#888', activestyle='none', takefocus=False) #///////////////////////////////////////////////////////////////// # Constructor #///////////////////////////////////////////////////////////////// def __init__(self, master, columns, column_weights=None, cnf={}, **kw): """ Construct a new multi-column listbox widget. :param master: The widget that should contain the new multi-column listbox. :param columns: Specifies what columns should be included in the new multi-column listbox. If ``columns`` is an integer, the it is the number of columns to include. 
If it is a list, then its length indicates the number of columns to include; and each element of the list will be used as a label for the corresponding column. :param cnf, kw: Configuration parameters for this widget. Use ``label_*`` to configure all labels; and ``listbox_*`` to configure all listboxes. E.g.: >>> mlb = MultiListbox(master, 5, label_foreground='red') """ # If columns was specified as an int, convert it to a list. if isinstance(columns, int): columns = list(range(columns)) include_labels = False else: include_labels = True if len(columns) == 0: raise ValueError("Expected at least one column") # Instance variables self._column_names = tuple(columns) self._listboxes = [] self._labels = [] # Pick a default value for column_weights, if none was specified. if column_weights is None: column_weights = [1] * len(columns) elif len(column_weights) != len(columns): raise ValueError('Expected one column_weight for each column') self._column_weights = column_weights # Configure our widgets. Frame.__init__(self, master, **self.FRAME_CONFIG) self.grid_rowconfigure(1, weight=1) for i, label in enumerate(self._column_names): self.grid_columnconfigure(i, weight=column_weights[i]) # Create a label for the column if include_labels: l = Label(self, text=label, **self.LABEL_CONFIG) self._labels.append(l) l.grid(column=i, row=0, sticky='news', padx=0, pady=0) l.column_index = i # Create a listbox for the column lb = Listbox(self, **self.LISTBOX_CONFIG) self._listboxes.append(lb) lb.grid(column=i, row=1, sticky='news', padx=0, pady=0) lb.column_index = i # Clicking or dragging selects: lb.bind('', self._select) lb.bind('', self._select) # Scroll whell scrolls: lb.bind('', lambda e: self._scroll(-1)) lb.bind('', lambda e: self._scroll(+1)) lb.bind('', lambda e: self._scroll(e.delta)) # Button 2 can be used to scan: lb.bind('', lambda e: self.scan_mark(e.x, e.y)) lb.bind('', lambda e: self.scan_dragto(e.x, e.y)) # Dragging outside the window has no effect (diable # the default listbox behavior, which scrolls): lb.bind('', lambda e: 'break') # Columns can be resized by dragging them: l.bind('', self._resize_column) # Columns can be resized by dragging them. (This binding is # used if they click on the grid between columns:) self.bind('', self._resize_column) # Set up key bindings for the widget: self.bind('', lambda e: self.select(delta=-1)) self.bind('', lambda e: self.select(delta=1)) self.bind('', lambda e: self.select(delta=-self._pagesize())) self.bind('', lambda e: self.select(delta=self._pagesize())) # Configuration customizations self.configure(cnf, **kw) #///////////////////////////////////////////////////////////////// # Column Resizing #///////////////////////////////////////////////////////////////// def _resize_column(self, event): """ Callback used to resize a column of the table. Return ``True`` if the column is actually getting resized (if the user clicked on the far left or far right 5 pixels of a label); and ``False`` otherwies. """ # If we're already waiting for a button release, then ignore # the new button press. if event.widget.bind(''): return False # Decide which column (if any) to resize. 
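# Typical construction and population (sketch; assumes an existing Tk
# root window named ``root``):
#
#     mlb = MultiListbox(root, ['Identifier', 'Status'], column_weights=[3, 1])
#     mlb.insert('end', ('punkt', 'installed'), ('treebank', 'not installed'))
#     mlb.pack(side='top', expand=True, fill='both')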
self._resize_column_index = None if event.widget is self: for i, lb in enumerate(self._listboxes): if abs(event.x-(lb.winfo_x()+lb.winfo_width())) < 10: self._resize_column_index = i elif event.x > (event.widget.winfo_width()-5): self._resize_column_index = event.widget.column_index elif event.x < 5 and event.widget.column_index != 0: self._resize_column_index = event.widget.column_index-1 # Bind callbacks that are used to resize it. if self._resize_column_index is not None: event.widget.bind('', self._resize_column_motion_cb) event.widget.bind('' % event.num, self._resize_column_buttonrelease_cb) return True else: return False def _resize_column_motion_cb(self, event): lb = self._listboxes[self._resize_column_index] charwidth = lb.winfo_width() / float(lb['width']) x1 = event.x + event.widget.winfo_x() x2 = lb.winfo_x() + lb.winfo_width() lb['width'] = max(3, lb['width'] + int((x1-x2)/charwidth)) def _resize_column_buttonrelease_cb(self, event): event.widget.unbind('' % event.num) event.widget.unbind('') #///////////////////////////////////////////////////////////////// # Properties #///////////////////////////////////////////////////////////////// @property def column_names(self): """ A tuple containing the names of the columns used by this multi-column listbox. """ return self._column_names @property def column_labels(self): """ A tuple containing the ``Tkinter.Label`` widgets used to display the label of each column. If this multi-column listbox was created without labels, then this will be an empty tuple. These widgets will all be augmented with a ``column_index`` attribute, which can be used to determine which column they correspond to. This can be convenient, e.g., when defining callbacks for bound events. """ return tuple(self._labels) @property def listboxes(self): """ A tuple containing the ``Tkinter.Listbox`` widgets used to display individual columns. These widgets will all be augmented with a ``column_index`` attribute, which can be used to determine which column they correspond to. This can be convenient, e.g., when defining callbacks for bound events. """ return tuple(self._listboxes) #///////////////////////////////////////////////////////////////// # Mouse & Keyboard Callback Functions #///////////////////////////////////////////////////////////////// def _select(self, e): i = e.widget.nearest(e.y) self.selection_clear(0, 'end') self.selection_set(i) self.activate(i) self.focus() def _scroll(self, delta): for lb in self._listboxes: lb.yview_scroll(delta, 'unit') return 'break' def _pagesize(self): """:return: The number of rows that makes up one page""" return int(self.index('@0,1000000')) - int(self.index('@0,0')) #///////////////////////////////////////////////////////////////// # Row selection #///////////////////////////////////////////////////////////////// def select(self, index=None, delta=None, see=True): """ Set the selected row. If ``index`` is specified, then select row ``index``. Otherwise, if ``delta`` is specified, then move the current selection by ``delta`` (negative numbers for up, positive numbers for down). This will not move the selection past the top or the bottom of the list. :param see: If true, then call ``self.see()`` with the newly selected index, to ensure that it is visible. """ if (index is not None) and (delta is not None): raise ValueError('specify index or delta, but not both') # If delta was given, then calculate index. 
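# Appearance and selection can also be driven programmatically; continuing
# the sketch above, where ``mlb`` is a MultiListbox:
#
#     mlb.configure(label_foreground='white', listbox_font='helvetica -12')
#     mlb.rowconfig(0, background='#f0f0ff')
#     mlb.select(0)              # highlight the first row
#     mlb.select(delta=1)        # then move the selection down one row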
if delta is not None: if len(self.curselection()) == 0: index = -1 + delta else: index = int(self.curselection()[0]) + delta # Clear all selected rows. self.selection_clear(0, 'end') # Select the specified index if index is not None: index = min(max(index, 0), self.size()-1) #self.activate(index) self.selection_set(index) if see: self.see(index) #///////////////////////////////////////////////////////////////// # Configuration #///////////////////////////////////////////////////////////////// def configure(self, cnf={}, **kw): """ Configure this widget. Use ``label_*`` to configure all labels; and ``listbox_*`` to configure all listboxes. E.g.: >>> mlb = MultiListbox(master, 5) >>> mlb.configure(label_foreground='red') >>> mlb.configure(listbox_foreground='red') """ cnf = dict(list(cnf.items()) + list(kw.items())) for (key, val) in list(cnf.items()): if key.startswith('label_') or key.startswith('label-'): for label in self._labels: label.configure({key[6:]: val}) elif key.startswith('listbox_') or key.startswith('listbox-'): for listbox in self._listboxes: listbox.configure({key[8:]: val}) else: Frame.configure(self, {key:val}) def __setitem__(self, key, val): """ Configure this widget. This is equivalent to ``self.configure({key,val``)}. See ``configure()``. """ self.configure({key:val}) def rowconfigure(self, row_index, cnf={}, **kw): """ Configure all table cells in the given row. Valid keyword arguments are: ``background``, ``bg``, ``foreground``, ``fg``, ``selectbackground``, ``selectforeground``. """ for lb in self._listboxes: lb.itemconfigure(row_index, cnf, **kw) def columnconfigure(self, col_index, cnf={}, **kw): """ Configure all table cells in the given column. Valid keyword arguments are: ``background``, ``bg``, ``foreground``, ``fg``, ``selectbackground``, ``selectforeground``. """ lb = self._listboxes[col_index] cnf = dict(list(cnf.items()) + list(kw.items())) for (key, val) in list(cnf.items()): if key in ('background', 'bg', 'foreground', 'fg', 'selectbackground', 'selectforeground'): for i in range(lb.size()): lb.itemconfigure(i, {key:val}) else: lb.configure({key:val}) def itemconfigure(self, row_index, col_index, cnf=None, **kw): """ Configure the table cell at the given row and column. Valid keyword arguments are: ``background``, ``bg``, ``foreground``, ``fg``, ``selectbackground``, ``selectforeground``. """ lb = self._listboxes[col_index] return lb.itemconfigure(row_index, cnf, **kw) #///////////////////////////////////////////////////////////////// # Value Access #///////////////////////////////////////////////////////////////// def insert(self, index, *rows): """ Insert the given row or rows into the table, at the given index. Each row value should be a tuple of cell values, one for each column in the row. Index may be an integer or any of the special strings (such as ``'end'``) accepted by ``Tkinter.Listbox``. """ for elt in rows: if len(elt) != len(self._column_names): raise ValueError('rows should be tuples whose length ' 'is equal to the number of columns') for (lb,elts) in zip(self._listboxes, list(zip(*rows))): lb.insert(index, *elts) def get(self, first, last=None): """ Return the value(s) of the specified row(s). If ``last`` is not specified, then return a single row value; otherwise, return a list of row values. Each row value is a tuple of cell values, one for each column in the row. 
""" values = [lb.get(first, last) for lb in self._listboxes] if last: return [tuple(row) for row in zip(*values)] else: return tuple(values) def bbox(self, row, col): """ Return the bounding box for the given table cell, relative to this widget's top-left corner. The bounding box is a tuple of integers ``(left, top, width, height)``. """ dx, dy, _, _ = self.grid_bbox(row=0, column=col) x, y, w, h = self._listboxes[col].bbox(row) return int(x)+int(dx), int(y)+int(dy), int(w), int(h) #///////////////////////////////////////////////////////////////// # Hide/Show Columns #///////////////////////////////////////////////////////////////// def hide_column(self, col_index): """ Hide the given column. The column's state is still maintained: its values will still be returned by ``get()``, and you must supply its values when calling ``insert()``. It is safe to call this on a column that is already hidden. :see: ``show_column()`` """ if self._labels: self._labels[col_index].grid_forget() self.listboxes[col_index].grid_forget() self.grid_columnconfigure(col_index, weight=0) def show_column(self, col_index): """ Display a column that has been hidden using ``hide_column()``. It is safe to call this on a column that is not hidden. """ weight = self._column_weights[col_index] if self._labels: self._labels[col_index].grid(column=col_index, row=0, sticky='news', padx=0, pady=0) self._listboxes[col_index].grid(column=col_index, row=1, sticky='news', padx=0, pady=0) self.grid_columnconfigure(col_index, weight=weight) #///////////////////////////////////////////////////////////////// # Binding Methods #///////////////////////////////////////////////////////////////// def bind_to_labels(self, sequence=None, func=None, add=None): """ Add a binding to each ``Tkinter.Label`` widget in this mult-column listbox that will call ``func`` in response to the event sequence. :return: A list of the identifiers of replaced binding functions (if any), allowing for their deletion (to prevent a memory leak). """ return [label.bind(sequence, func, add) for label in self.column_labels] def bind_to_listboxes(self, sequence=None, func=None, add=None): """ Add a binding to each ``Tkinter.Listbox`` widget in this mult-column listbox that will call ``func`` in response to the event sequence. :return: A list of the identifiers of replaced binding functions (if any), allowing for their deletion (to prevent a memory leak). """ for listbox in self.listboxes: listbox.bind(sequence, func, add) def bind_to_columns(self, sequence=None, func=None, add=None): """ Add a binding to each ``Tkinter.Label`` and ``Tkinter.Listbox`` widget in this mult-column listbox that will call ``func`` in response to the event sequence. :return: A list of the identifiers of replaced binding functions (if any), allowing for their deletion (to prevent a memory leak). 
""" return (self.bind_to_labels(sequence, func, add) + self.bind_to_listboxes(sequence, func, add)) #///////////////////////////////////////////////////////////////// # Simple Delegation #///////////////////////////////////////////////////////////////// # These methods delegate to the first listbox: def curselection(self, *args, **kwargs): return self._listboxes[0].curselection(*args, **kwargs) def selection_includes(self, *args, **kwargs): return self._listboxes[0].selection_includes(*args, **kwargs) def itemcget(self, *args, **kwargs): return self._listboxes[0].itemcget(*args, **kwargs) def size(self, *args, **kwargs): return self._listboxes[0].size(*args, **kwargs) def index(self, *args, **kwargs): return self._listboxes[0].index(*args, **kwargs) def nearest(self, *args, **kwargs): return self._listboxes[0].nearest(*args, **kwargs) # These methods delegate to each listbox (and return None): def activate(self, *args, **kwargs): for lb in self._listboxes: lb.activate(*args, **kwargs) def delete(self, *args, **kwargs): for lb in self._listboxes: lb.delete(*args, **kwargs) def scan_mark(self, *args, **kwargs): for lb in self._listboxes: lb.scan_mark(*args, **kwargs) def scan_dragto(self, *args, **kwargs): for lb in self._listboxes: lb.scan_dragto(*args, **kwargs) def see(self, *args, **kwargs): for lb in self._listboxes: lb.see(*args, **kwargs) def selection_anchor(self, *args, **kwargs): for lb in self._listboxes: lb.selection_anchor(*args, **kwargs) def selection_clear(self, *args, **kwargs): for lb in self._listboxes: lb.selection_clear(*args, **kwargs) def selection_set(self, *args, **kwargs): for lb in self._listboxes: lb.selection_set(*args, **kwargs) def yview(self, *args, **kwargs): for lb in self._listboxes: v = lb.yview(*args, **kwargs) return v # if called with no arguments def yview_moveto(self, *args, **kwargs): for lb in self._listboxes: lb.yview_moveto(*args, **kwargs) def yview_scroll(self, *args, **kwargs): for lb in self._listboxes: lb.yview_scroll(*args, **kwargs) #///////////////////////////////////////////////////////////////// # Aliases #///////////////////////////////////////////////////////////////// itemconfig = itemconfigure rowconfig = rowconfigure columnconfig = columnconfigure select_anchor = selection_anchor select_clear = selection_clear select_includes = selection_includes select_set = selection_set #///////////////////////////////////////////////////////////////// # These listbox methods are not defined for multi-listbox #///////////////////////////////////////////////////////////////// # def xview(self, *what): pass # def xview_moveto(self, fraction): pass # def xview_scroll(self, number, what): pass ###################################################################### # Table ###################################################################### class Table(object): """ A display widget for a table of values, based on a ``MultiListbox`` widget. For many purposes, ``Table`` can be treated as a list-of-lists. E.g., table[i] is a list of the values for row i; and table.append(row) adds a new row with the given lits of values. Individual cells can be accessed using table[i,j], which refers to the j-th column of the i-th row. This can be used to both read and write values from the table. E.g.: >>> table[i,j] = 'hello' The column (j) can be given either as an index number, or as a column name. 
E.g., the following prints the value in the 3rd row for the 'First Name' column: >>> print(table[3, 'First Name']) John You can configure the colors for individual rows, columns, or cells using ``rowconfig()``, ``columnconfig()``, and ``itemconfig()``. The color configuration for each row will be preserved if the table is modified; however, when new rows are added, any color configurations that have been made for *columns* will not be applied to the new row. Note: Although ``Table`` acts like a widget in some ways (e.g., it defines ``grid()``, ``pack()``, and ``bind()``), it is not itself a widget; it just contains one. This is because widgets need to define ``__getitem__()``, ``__setitem__()``, and ``__nonzero__()`` in a way that's incompatible with the fact that ``Table`` behaves as a list-of-lists. :ivar _mlb: The multi-column listbox used to display this table's data. :ivar _rows: A list-of-lists used to hold the cell values of this table. Each element of _rows is a row value, i.e., a list of cell values, one for each column in the row. """ def __init__(self, master, column_names, rows=None, column_weights=None, scrollbar=True, click_to_sort=True, reprfunc=None, cnf={}, **kw): """ Construct a new Table widget. :type master: Tkinter.Widget :param master: The widget that should contain the new table. :type column_names: list(str) :param column_names: A list of names for the columns; these names will be used to create labels for each column; and can be used as an index when reading or writing cell values from the table. :type rows: list(list) :param rows: A list of row values used to initialze the table. Each row value should be a tuple of cell values, one for each column in the row. :type scrollbar: bool :param scrollbar: If true, then create a scrollbar for the new table widget. :type click_to_sort: bool :param click_to_sort: If true, then create bindings that will sort the table's rows by a given column's values if the user clicks on that colum's label. :type reprfunc: function :param reprfunc: If specified, then use this function to convert each table cell value to a string suitable for display. ``reprfunc`` has the following signature: reprfunc(row_index, col_index, cell_value) -> str (Note that the column is specified by index, not by name.) :param cnf, kw: Configuration parameters for this widget's contained ``MultiListbox``. See ``MultiListbox.__init__()`` for details. """ self._num_columns = len(column_names) self._reprfunc = reprfunc self._frame = Frame(master) self._column_name_to_index = dict((c,i) for (i,c) in enumerate(column_names)) # Make a copy of the rows & check that it's valid. if rows is None: self._rows = [] else: self._rows = [[v for v in row] for row in rows] for row in self._rows: self._checkrow(row) # Create our multi-list box. self._mlb = MultiListbox(self._frame, column_names, column_weights, cnf, **kw) self._mlb.pack(side='left', expand=True, fill='both') # Optional scrollbar if scrollbar: sb = Scrollbar(self._frame, orient='vertical', command=self._mlb.yview) self._mlb.listboxes[0]['yscrollcommand'] = sb.set #for listbox in self._mlb.listboxes: # listbox['yscrollcommand'] = sb.set sb.pack(side='right', fill='y') self._scrollbar = sb # Set up sorting self._sortkey = None if click_to_sort: for i, l in enumerate(self._mlb.column_labels): l.bind('', self._sort) # Fill in our multi-list box. 
self._fill_table() #///////////////////////////////////////////////////////////////// #{ Widget-like Methods #///////////////////////////////////////////////////////////////// # These all just delegate to either our frame or our MLB. def pack(self, *args, **kwargs): """Position this table's main frame widget in its parent widget. See ``Tkinter.Frame.pack()`` for more info.""" self._frame.pack(*args, **kwargs) def grid(self, *args, **kwargs): """Position this table's main frame widget in its parent widget. See ``Tkinter.Frame.grid()`` for more info.""" self._frame.grid(*args, **kwargs) def focus(self): """Direct (keyboard) input foxus to this widget.""" self._mlb.focus() def bind(self, sequence=None, func=None, add=None): """Add a binding to this table's main frame that will call ``func`` in response to the event sequence.""" self._mlb.bind(sequence, func, add) def rowconfigure(self, row_index, cnf={}, **kw): """:see: ``MultiListbox.rowconfigure()``""" self._mlb.rowconfigure(row_index, cnf, **kw) def columnconfigure(self, col_index, cnf={}, **kw): """:see: ``MultiListbox.columnconfigure()``""" col_index = self.column_index(col_index) self._mlb.columnconfigure(col_index, cnf, **kw) def itemconfigure(self, row_index, col_index, cnf=None, **kw): """:see: ``MultiListbox.itemconfigure()``""" col_index = self.column_index(col_index) return self._mlb.itemconfigure(row_index, col_index, cnf, **kw) def bind_to_labels(self, sequence=None, func=None, add=None): """:see: ``MultiListbox.bind_to_labels()``""" return self._mlb.bind_to_labels(sequence, func, add) def bind_to_listboxes(self, sequence=None, func=None, add=None): """:see: ``MultiListbox.bind_to_listboxes()``""" return self._mlb.bind_to_listboxes(sequence, func, add) def bind_to_columns(self, sequence=None, func=None, add=None): """:see: ``MultiListbox.bind_to_columns()``""" return self._mlb.bind_to_columns(sequence, func, add) rowconfig = rowconfigure columnconfig = columnconfigure itemconfig = itemconfigure #///////////////////////////////////////////////////////////////// #{ Table as list-of-lists #///////////////////////////////////////////////////////////////// def insert(self, row_index, rowvalue): """ Insert a new row into the table, so that its row index will be ``row_index``. If the table contains any rows whose row index is greater than or equal to ``row_index``, then they will be shifted down. :param rowvalue: A tuple of cell values, one for each column in the new row. """ self._checkrow(rowvalue) self._rows.insert(row_index, rowvalue) if self._reprfunc is not None: rowvalue = [self._reprfunc(row_index,j,v) for (j,v) in enumerate(rowvalue)] self._mlb.insert(row_index, rowvalue) if self._DEBUG: self._check_table_vs_mlb() def extend(self, rowvalues): """ Add new rows at the end of the table. :param rowvalues: A list of row values used to initialze the table. Each row value should be a tuple of cell values, one for each column in the row. """ for rowvalue in rowvalues: self.append(rowvalue) if self._DEBUG: self._check_table_vs_mlb() def append(self, rowvalue): """ Add a new row to the end of the table. :param rowvalue: A tuple of cell values, one for each column in the new row. """ self.insert(len(self._rows), rowvalue) if self._DEBUG: self._check_table_vs_mlb() def clear(self): """ Delete all rows in this table. """ self._rows = [] self._mlb.delete(0, 'end') if self._DEBUG: self._check_table_vs_mlb() def __getitem__(self, index): """ Return the value of a row or a cell in this table. 
        If ``index`` is an integer, then return the row value for the
        ``index``th row.  This row value consists of a tuple of cell
        values, one for each column in the row.

        If ``index`` is a tuple of two integers, ``(i,j)``, then return
        the value of the cell in the ``i``th row and the ``j``th column.
        """
        if isinstance(index, slice):
            raise ValueError('Slicing not supported')
        elif isinstance(index, tuple) and len(index)==2:
            return self._rows[index[0]][self.column_index(index[1])]
        else:
            return tuple(self._rows[index])

    def __setitem__(self, index, val):
        """
        Replace the value of a row or a cell in this table with
        ``val``.

        If ``index`` is an integer, then ``val`` should be a row value
        (i.e., a tuple of cell values, one for each column).  In this
        case, the values of the ``index``th row of the table will be
        replaced with the values in ``val``.

        If ``index`` is a tuple of integers, ``(i,j)``, then replace the
        value of the cell in the ``i``th row and ``j``th column with
        ``val``.
        """
        if isinstance(index, slice):
            raise ValueError('Slicing not supported')

        # table[i,j] = val
        elif isinstance(index, tuple) and len(index)==2:
            i, j = index[0], self.column_index(index[1])
            config_cookie = self._save_config_info([i])
            self._rows[i][j] = val
            if self._reprfunc is not None:
                val = self._reprfunc(i, j, val)
            self._mlb.listboxes[j].insert(i, val)
            self._mlb.listboxes[j].delete(i+1)
            self._restore_config_info(config_cookie)

        # table[i] = val
        else:
            config_cookie = self._save_config_info([index])
            self._checkrow(val)
            self._rows[index] = list(val)
            if self._reprfunc is not None:
                val = [self._reprfunc(index,j,v) for (j,v) in enumerate(val)]
            self._mlb.insert(index, val)
            self._mlb.delete(index+1)
            self._restore_config_info(config_cookie)

    def __delitem__(self, row_index):
        """
        Delete the ``row_index``th row from this table.
        """
        if isinstance(row_index, slice):
            raise ValueError('Slicing not supported')
        if isinstance(row_index, tuple) and len(row_index)==2:
            raise ValueError('Cannot delete a single cell!')
        del self._rows[row_index]
        self._mlb.delete(row_index)
        if self._DEBUG: self._check_table_vs_mlb()

    def __len__(self):
        """
        :return: the number of rows in this table.
        """
        return len(self._rows)

    def _checkrow(self, rowvalue):
        """
        Helper function: check that a given row value has the correct
        number of elements; and if not, raise an exception.
        """
        if len(rowvalue) != self._num_columns:
            raise ValueError('Row %r has %d columns; expected %d' %
                             (rowvalue, len(rowvalue), self._num_columns))

    #/////////////////////////////////////////////////////////////////
    # Columns
    #/////////////////////////////////////////////////////////////////

    @property
    def column_names(self):
        """A list of the names of the columns in this table."""
        return self._mlb.column_names

    def column_index(self, i):
        """
        If ``i`` is a valid column index integer, then return it as is.
        Otherwise, check if ``i`` is used as the name for any column;
        if so, return that column's index.  Otherwise, raise a
        ``KeyError`` exception.
        """
        if isinstance(i, int) and 0 <= i < self._num_columns:
            return i
        else:
            # This raises a key error if the column is not found.
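            # Illustration (hypothetical column names): for a table created
            # with column_names=['Word', 'Synset'], column_index('Word')
            # returns 0, column_index(1) returns 1, and an unknown name
            # raises KeyError.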
return self._column_name_to_index[i] def hide_column(self, column_index): """:see: ``MultiListbox.hide_column()``""" self._mlb.hide_column(self.column_index(column_index)) def show_column(self, column_index): """:see: ``MultiListbox.show_column()``""" self._mlb.show_column(self.column_index(column_index)) #///////////////////////////////////////////////////////////////// # Selection #///////////////////////////////////////////////////////////////// def selected_row(self): """ Return the index of the currently selected row, or None if no row is selected. To get the row value itself, use ``table[table.selected_row()]``. """ sel = self._mlb.curselection() if sel: return int(sel[0]) else: return None def select(self, index=None, delta=None, see=True): """:see: ``MultiListbox.select()``""" self._mlb.select(index, delta, see) #///////////////////////////////////////////////////////////////// # Sorting #///////////////////////////////////////////////////////////////// def sort_by(self, column_index, order='toggle'): """ Sort the rows in this table, using the specified column's values as a sort key. :param column_index: Specifies which column to sort, using either a column index (int) or a column's label name (str). :param order: Specifies whether to sort the values in ascending or descending order: - ``'ascending'``: Sort from least to greatest. - ``'descending'``: Sort from greatest to least. - ``'toggle'``: If the most recent call to ``sort_by()`` sorted the table by the same column (``column_index``), then reverse the rows; otherwise sort in ascending order. """ if order not in ('ascending', 'descending', 'toggle'): raise ValueError('sort_by(): order should be "ascending", ' '"descending", or "toggle".') column_index = self.column_index(column_index) config_cookie = self._save_config_info(index_by_id=True) # Sort the rows. if order == 'toggle' and column_index == self._sortkey: self._rows.reverse() else: self._rows.sort(key=operator.itemgetter(column_index), reverse=(order=='descending')) self._sortkey = column_index # Redraw the table. self._fill_table() self._restore_config_info(config_cookie, index_by_id=True, see=True) if self._DEBUG: self._check_table_vs_mlb() def _sort(self, event): """Event handler for clicking on a column label -- sort by that column.""" column_index = event.widget.column_index # If they click on the far-left of far-right of a column's # label, then resize rather than sorting. if self._mlb._resize_column(event): return 'continue' # Otherwise, sort. else: self.sort_by(column_index) return 'continue' #///////////////////////////////////////////////////////////////// #{ Table Drawing Helpers #///////////////////////////////////////////////////////////////// def _fill_table(self, save_config=True): """ Re-draw the table from scratch, by clearing out the table's multi-column listbox; and then filling it in with values from ``self._rows``. Note that any cell-, row-, or column-specific color configuration that has been done will be lost. The selection will also be lost -- i.e., no row will be selected after this call completes. 
""" self._mlb.delete(0, 'end') for i, row in enumerate(self._rows): if self._reprfunc is not None: row = [self._reprfunc(i,j,v) for (j,v) in enumerate(row)] self._mlb.insert('end', row) def _get_itemconfig(self, r, c): return dict( (k, self._mlb.itemconfig(r, c, k)[-1]) for k in ('foreground', 'selectforeground', 'background', 'selectbackground') ) def _save_config_info(self, row_indices=None, index_by_id=False): """ Return a 'cookie' containing information about which row is selected, and what color configurations have been applied. this information can the be re-applied to the table (after making modifications) using ``_restore_config_info()``. Color configuration information will be saved for any rows in ``row_indices``, or in the entire table, if ``row_indices=None``. If ``index_by_id=True``, the the cookie will associate rows with their configuration information based on the rows' python id. This is useful when performing operations that re-arrange the rows (e.g. ``sort``). If ``index_by_id=False``, then it is assumed that all rows will be in the same order when ``_restore_config_info()`` is called. """ # Default value for row_indices is all rows. if row_indices is None: row_indices = list(range(len(self._rows))) # Look up our current selection. selection = self.selected_row() if index_by_id and selection is not None: selection = id(self._rows[selection]) # Look up the color configuration info for each row. if index_by_id: config = dict((id(self._rows[r]), [self._get_itemconfig(r, c) for c in range(self._num_columns)]) for r in row_indices) else: config = dict((r, [self._get_itemconfig(r, c) for c in range(self._num_columns)]) for r in row_indices) return selection, config def _restore_config_info(self, cookie, index_by_id=False, see=False): """ Restore selection & color configuration information that was saved using ``_save_config_info``. """ selection, config = cookie # Clear the selection. if selection is None: self._mlb.selection_clear(0, 'end') # Restore selection & color config if index_by_id: for r, row in enumerate(self._rows): if id(row) in config: for c in range(self._num_columns): self._mlb.itemconfigure(r, c, config[id(row)][c]) if id(row) == selection: self._mlb.select(r, see=see) else: if selection is not None: self._mlb.select(selection, see=see) for r in config: for c in range(self._num_columns): self._mlb.itemconfigure(r, c, config[r][c]) #///////////////////////////////////////////////////////////////// # Debugging (Invariant Checker) #///////////////////////////////////////////////////////////////// _DEBUG = False """If true, then run ``_check_table_vs_mlb()`` after any operation that modifies the table.""" def _check_table_vs_mlb(self): """ Verify that the contents of the table's ``_rows`` variable match the contents of its multi-listbox (``_mlb``). This is just included for debugging purposes, to make sure that the list-modifying operations are working correctly. 
""" for col in self._mlb.listboxes: assert len(self) == col.size() for row in self: assert len(row) == self._num_columns assert self._num_columns == len(self._mlb.column_names) #assert self._column_names == self._mlb.column_names for i, row in enumerate(self): for j, cell in enumerate(row): if self._reprfunc is not None: cell = self._reprfunc(i, j, cell) assert self._mlb.get(i)[j] == cell ###################################################################### # Demo/Test Function ###################################################################### # update this to use new WordNet API def demo(): root = Tk() root.bind('', lambda e: root.destroy()) table = Table(root, 'Word Synset Hypernym Hyponym'.split(), column_weights=[0, 1, 1, 1], reprfunc=(lambda i,j,s: ' %s' % s)) table.pack(expand=True, fill='both') from nltk.corpus import wordnet from nltk.corpus import brown for word, pos in sorted(set(brown.tagged_words()[:500])): if pos[0] != 'N': continue word = word.lower() for synset in wordnet.synsets(word): hyper = (synset.hypernyms()+[''])[0] hypo = (synset.hyponyms()+[''])[0] table.append([word, getattr(synset, 'definition', '*none*'), getattr(hyper, 'definition', '*none*'), getattr(hypo, 'definition', '*none*')]) table.columnconfig('Word', background='#afa') table.columnconfig('Synset', background='#efe') table.columnconfig('Hypernym', background='#fee') table.columnconfig('Hyponym', background='#ffe') for row in range(len(table)): for column in ('Hypernym', 'Hyponym'): if table[row, column] == '*none*': table.itemconfig(row, column, foreground='#666', selectforeground='#666') root.mainloop() if __name__ == '__main__': demo() nltk-3.1/nltk/draw/tree.py0000644000076500000240000011040012607224144015270 0ustar sbstaff00000000000000# Natural Language Toolkit: Graphical Representations for Trees # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Graphically display a Tree. """ import nltk.compat import sys from tkinter import IntVar, Menu, Tk from nltk.util import in_idle from nltk.tree import Tree from nltk.draw.util import (CanvasFrame, CanvasWidget, BoxWidget, TextWidget, ParenWidget, OvalWidget) ##////////////////////////////////////////////////////// ## Tree Segment ##////////////////////////////////////////////////////// class TreeSegmentWidget(CanvasWidget): """ A canvas widget that displays a single segment of a hierarchical tree. Each ``TreeSegmentWidget`` connects a single "node widget" to a sequence of zero or more "subtree widgets". By default, the bottom of the node is connected to the top of each subtree by a single line. However, if the ``roof`` attribute is set, then a single triangular "roof" will connect the node to all of its children. Attributes: - ``roof``: What sort of connection to draw between the node and its subtrees. If ``roof`` is true, draw a single triangular "roof" over the subtrees. If ``roof`` is false, draw a line between each subtree and the node. Default value is false. - ``xspace``: The amount of horizontal space to leave between subtrees when managing this widget. Default value is 10. - ``yspace``: The amount of space to place between the node and its children when managing this widget. Default value is 15. - ``color``: The color of the lines connecting the node to its subtrees; and of the outline of the triangular roof. Default value is ``'#006060'``. - ``fill``: The fill color for the triangular roof. Default value is ``''`` (no fill). 
- ``width``: The width of the lines connecting the node to its subtrees; and of the outline of the triangular roof. Default value is 1. - ``orientation``: Determines whether the tree branches downwards or rightwards. Possible values are ``'horizontal'`` and ``'vertical'``. The default value is ``'vertical'`` (i.e., branch downwards). - ``draggable``: whether the widget can be dragged by the user. """ def __init__(self, canvas, label, subtrees, **attribs): """ :type node: :type subtrees: list(CanvasWidgetI) """ self._label = label self._subtrees = subtrees # Attributes self._horizontal = 0 self._roof = 0 self._xspace = 10 self._yspace = 15 self._ordered = False # Create canvas objects. self._lines = [canvas.create_line(0,0,0,0, fill='#006060') for c in subtrees] self._polygon = canvas.create_polygon(0,0, fill='', state='hidden', outline='#006060') # Register child widgets (label + subtrees) self._add_child_widget(label) for subtree in subtrees: self._add_child_widget(subtree) # Are we currently managing? self._managing = False CanvasWidget.__init__(self, canvas, **attribs) def __setitem__(self, attr, value): canvas = self.canvas() if attr == 'roof': self._roof = value if self._roof: for l in self._lines: canvas.itemconfig(l, state='hidden') canvas.itemconfig(self._polygon, state='normal') else: for l in self._lines: canvas.itemconfig(l, state='normal') canvas.itemconfig(self._polygon, state='hidden') elif attr == 'orientation': if value == 'horizontal': self._horizontal = 1 elif value == 'vertical': self._horizontal = 0 else: raise ValueError('orientation must be horizontal or vertical') elif attr == 'color': for l in self._lines: canvas.itemconfig(l, fill=value) canvas.itemconfig(self._polygon, outline=value) elif isinstance(attr, tuple) and attr[0] == 'color': # Set the color of an individual line. l = self._lines[int(attr[1])] canvas.itemconfig(l, fill=value) elif attr == 'fill': canvas.itemconfig(self._polygon, fill=value) elif attr == 'width': canvas.itemconfig(self._polygon, {attr:value}) for l in self._lines: canvas.itemconfig(l, {attr:value}) elif attr in ('xspace', 'yspace'): if attr == 'xspace': self._xspace = value elif attr == 'yspace': self._yspace = value self.update(self._label) elif attr == 'ordered': self._ordered = value else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == 'roof': return self._roof elif attr == 'width': return self.canvas().itemcget(self._polygon, attr) elif attr == 'color': return self.canvas().itemcget(self._polygon, 'outline') elif isinstance(attr, tuple) and attr[0] == 'color': l = self._lines[int(attr[1])] return self.canvas().itemcget(l, 'fill') elif attr == 'xspace': return self._xspace elif attr == 'yspace': return self._yspace elif attr == 'orientation': if self._horizontal: return 'horizontal' else: return 'vertical' elif attr == 'ordered': return self._ordered else: return CanvasWidget.__getitem__(self, attr) def label(self): return self._label def subtrees(self): return self._subtrees[:] def set_label(self, label): """ Set the node label to ``label``. """ self._remove_child_widget(self._label) self._add_child_widget(label) self._label = label self.update(self._label) def replace_child(self, oldchild, newchild): """ Replace the child ``oldchild`` with ``newchild``. 
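
        For example, a sketch (``seg`` is an existing ``TreeSegmentWidget``,
        ``old_leaf`` is one of its current subtree widgets, and the
        replacement here is an arbitrary ``TextWidget``):

            >>> seg.replace_child(old_leaf, TextWidget(canvas, 'new leaf'))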
""" index = self._subtrees.index(oldchild) self._subtrees[index] = newchild self._remove_child_widget(oldchild) self._add_child_widget(newchild) self.update(newchild) def remove_child(self, child): index = self._subtrees.index(child) del self._subtrees[index] self._remove_child_widget(child) self.canvas().delete(self._lines.pop()) self.update(self._label) def insert_child(self, index, child): canvas = self.canvas() self._subtrees.insert(index, child) self._add_child_widget(child) self._lines.append(canvas.create_line(0,0,0,0, fill='#006060')) self.update(self._label) # but.. lines??? def _tags(self): if self._roof: return [self._polygon] else: return self._lines def _subtree_top(self, child): if isinstance(child, TreeSegmentWidget): bbox = child.label().bbox() else: bbox = child.bbox() if self._horizontal: return (bbox[0], (bbox[1]+bbox[3])/2.0) else: return ((bbox[0]+bbox[2])/2.0, bbox[1]) def _node_bottom(self): bbox = self._label.bbox() if self._horizontal: return (bbox[2], (bbox[1]+bbox[3])/2.0) else: return ((bbox[0]+bbox[2])/2.0, bbox[3]) def _update(self, child): if len(self._subtrees) == 0: return if self._label.bbox() is None: return # [XX] ??? # Which lines need to be redrawn? if child is self._label: need_update = self._subtrees else: need_update = [child] if self._ordered and not self._managing: need_update = self._maintain_order(child) # Update the polygon. (nodex, nodey) = self._node_bottom() (xmin, ymin, xmax, ymax) = self._subtrees[0].bbox() for subtree in self._subtrees[1:]: bbox = subtree.bbox() xmin = min(xmin, bbox[0]) ymin = min(ymin, bbox[1]) xmax = max(xmax, bbox[2]) ymax = max(ymax, bbox[3]) if self._horizontal: self.canvas().coords(self._polygon, nodex, nodey, xmin, ymin, xmin, ymax, nodex, nodey) else: self.canvas().coords(self._polygon, nodex, nodey, xmin, ymin, xmax, ymin, nodex, nodey) # Redraw all lines that need it. for subtree in need_update: (nodex, nodey) = self._node_bottom() line = self._lines[self._subtrees.index(subtree)] (subtreex, subtreey) = self._subtree_top(subtree) self.canvas().coords(line, nodex, nodey, subtreex, subtreey) def _maintain_order(self, child): if self._horizontal: return self._maintain_order_horizontal(child) else: return self._maintain_order_vertical(child) def _maintain_order_vertical(self, child): (left, top, right, bot) = child.bbox() if child is self._label: # Check all the leaves for subtree in self._subtrees: (x1, y1, x2, y2) = subtree.bbox() if bot+self._yspace > y1: subtree.move(0,bot+self._yspace-y1) return self._subtrees else: moved = [child] index = self._subtrees.index(child) # Check leaves to our right. x = right + self._xspace for i in range(index+1, len(self._subtrees)): (x1, y1, x2, y2) = self._subtrees[i].bbox() if x > x1: self._subtrees[i].move(x-x1, 0) x += x2-x1 + self._xspace moved.append(self._subtrees[i]) # Check leaves to our left. 
x = left - self._xspace for i in range(index-1, -1, -1): (x1, y1, x2, y2) = self._subtrees[i].bbox() if x < x2: self._subtrees[i].move(x-x2, 0) x -= x2-x1 + self._xspace moved.append(self._subtrees[i]) # Check the node (x1, y1, x2, y2) = self._label.bbox() if y2 > top-self._yspace: self._label.move(0, top-self._yspace-y2) moved = self._subtrees # Return a list of the nodes we moved return moved def _maintain_order_horizontal(self, child): (left, top, right, bot) = child.bbox() if child is self._label: # Check all the leaves for subtree in self._subtrees: (x1, y1, x2, y2) = subtree.bbox() if right+self._xspace > x1: subtree.move(right+self._xspace-x1) return self._subtrees else: moved = [child] index = self._subtrees.index(child) # Check leaves below us. y = bot + self._yspace for i in range(index+1, len(self._subtrees)): (x1, y1, x2, y2) = self._subtrees[i].bbox() if y > y1: self._subtrees[i].move(0, y-y1) y += y2-y1 + self._yspace moved.append(self._subtrees[i]) # Check leaves above us y = top - self._yspace for i in range(index-1, -1, -1): (x1, y1, x2, y2) = self._subtrees[i].bbox() if y < y2: self._subtrees[i].move(0, y-y2) y -= y2-y1 + self._yspace moved.append(self._subtrees[i]) # Check the node (x1, y1, x2, y2) = self._label.bbox() if x2 > left-self._xspace: self._label.move(left-self._xspace-x2, 0) moved = self._subtrees # Return a list of the nodes we moved return moved def _manage_horizontal(self): (nodex, nodey) = self._node_bottom() # Put the subtrees in a line. y = 20 for subtree in self._subtrees: subtree_bbox = subtree.bbox() dx = nodex - subtree_bbox[0] + self._xspace dy = y - subtree_bbox[1] subtree.move(dx, dy) y += subtree_bbox[3] - subtree_bbox[1] + self._yspace # Find the center of their tops. center = 0.0 for subtree in self._subtrees: center += self._subtree_top(subtree)[1] center /= len(self._subtrees) # Center the subtrees with the node. for subtree in self._subtrees: subtree.move(0, nodey-center) def _manage_vertical(self): (nodex, nodey) = self._node_bottom() # Put the subtrees in a line. x = 0 for subtree in self._subtrees: subtree_bbox = subtree.bbox() dy = nodey - subtree_bbox[1] + self._yspace dx = x - subtree_bbox[0] subtree.move(dx, dy) x += subtree_bbox[2] - subtree_bbox[0] + self._xspace # Find the center of their tops. center = 0.0 for subtree in self._subtrees: center += self._subtree_top(subtree)[0]/len(self._subtrees) # Center the subtrees with the node. for subtree in self._subtrees: subtree.move(nodex-center, 0) def _manage(self): self._managing = True (nodex, nodey) = self._node_bottom() if len(self._subtrees) == 0: return if self._horizontal: self._manage_horizontal() else: self._manage_vertical() # Update lines to subtrees. for subtree in self._subtrees: self._update(subtree) self._managing = False def __repr__(self): return '[TreeSeg %s: %s]' % (self._label, self._subtrees) def _tree_to_treeseg(canvas, t, make_node, make_leaf, tree_attribs, node_attribs, leaf_attribs, loc_attribs): if isinstance(t, Tree): label = make_node(canvas, t.label(), **node_attribs) subtrees = [_tree_to_treeseg(canvas, child, make_node, make_leaf, tree_attribs, node_attribs, leaf_attribs, loc_attribs) for child in t] return TreeSegmentWidget(canvas, label, subtrees, **tree_attribs) else: return make_leaf(canvas, t, **leaf_attribs) def tree_to_treesegment(canvas, t, make_node=TextWidget, make_leaf=TextWidget, **attribs): """ Convert a Tree into a ``TreeSegmentWidget``. :param make_node: A ``CanvasWidget`` constructor or a function that creates ``CanvasWidgets``. 
``make_node`` is used to convert the Tree's nodes into ``CanvasWidgets``. If no constructor is specified, then ``TextWidget`` will be used. :param make_leaf: A ``CanvasWidget`` constructor or a function that creates ``CanvasWidgets``. ``make_leaf`` is used to convert the Tree's leafs into ``CanvasWidgets``. If no constructor is specified, then ``TextWidget`` will be used. :param attribs: Attributes for the canvas widgets that make up the returned ``TreeSegmentWidget``. Any attribute beginning with ``'tree_'`` will be passed to all ``TreeSegmentWidgets`` (with the ``'tree_'`` prefix removed. Any attribute beginning with ``'node_'`` will be passed to all nodes. Any attribute beginning with ``'leaf_'`` will be passed to all leaves. And any attribute beginning with ``'loc_'`` will be passed to all text locations (for Trees). """ # Process attribs. tree_attribs = {} node_attribs = {} leaf_attribs = {} loc_attribs = {} for (key, value) in list(attribs.items()): if key[:5] == 'tree_': tree_attribs[key[5:]] = value elif key[:5] == 'node_': node_attribs[key[5:]] = value elif key[:5] == 'leaf_': leaf_attribs[key[5:]] = value elif key[:4] == 'loc_': loc_attribs[key[4:]] = value else: raise ValueError('Bad attribute: %s' % key) return _tree_to_treeseg(canvas, t, make_node, make_leaf, tree_attribs, node_attribs, leaf_attribs, loc_attribs) ##////////////////////////////////////////////////////// ## Tree Widget ##////////////////////////////////////////////////////// class TreeWidget(CanvasWidget): """ A canvas widget that displays a single Tree. ``TreeWidget`` manages a group of ``TreeSegmentWidgets`` that are used to display a Tree. Attributes: - ``node_attr``: Sets the attribute ``attr`` on all of the node widgets for this ``TreeWidget``. - ``node_attr``: Sets the attribute ``attr`` on all of the leaf widgets for this ``TreeWidget``. - ``loc_attr``: Sets the attribute ``attr`` on all of the location widgets for this ``TreeWidget`` (if it was built from a Tree). Note that a location widget is a ``TextWidget``. - ``xspace``: The amount of horizontal space to leave between subtrees when managing this widget. Default value is 10. - ``yspace``: The amount of space to place between the node and its children when managing this widget. Default value is 15. - ``line_color``: The color of the lines connecting each expanded node to its subtrees. - ``roof_color``: The color of the outline of the triangular roof for collapsed trees. - ``roof_fill``: The fill color for the triangular roof for collapsed trees. - ``width`` - ``orientation``: Determines whether the tree branches downwards or rightwards. Possible values are ``'horizontal'`` and ``'vertical'``. The default value is ``'vertical'`` (i.e., branch downwards). - ``shapeable``: whether the subtrees can be independently dragged by the user. THIS property simply sets the ``DRAGGABLE`` property on all of the ``TreeWidget``'s tree segments. - ``draggable``: whether the widget can be dragged by the user. """ def __init__(self, canvas, t, make_node=TextWidget, make_leaf=TextWidget, **attribs): # Node & leaf canvas widget constructors self._make_node = make_node self._make_leaf = make_leaf self._tree = t # Attributes. self._nodeattribs = {} self._leafattribs = {} self._locattribs = {'color': '#008000'} self._line_color = '#008080' self._line_width = 1 self._roof_color = '#008080' self._roof_fill = '#c0c0c0' self._shapeable = False self._xspace = 10 self._yspace = 10 self._orientation = 'vertical' self._ordered = False # Build trees. 
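        # Sketch of the bookkeeping set up below (see expanded_tree() and
        # collapsed_tree()): each subtree widget is keyed by its path from
        # the root -- a tuple of child indices, e.g. () for the root and
        # (0,) for its first child.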
self._keys = {} # treeseg -> key self._expanded_trees = {} self._collapsed_trees = {} self._nodes = [] self._leaves = [] #self._locs = [] self._make_collapsed_trees(canvas, t, ()) self._treeseg = self._make_expanded_tree(canvas, t, ()) self._add_child_widget(self._treeseg) CanvasWidget.__init__(self, canvas, **attribs) def expanded_tree(self, *path_to_tree): """ Return the ``TreeSegmentWidget`` for the specified subtree. :param path_to_tree: A list of indices i1, i2, ..., in, where the desired widget is the widget corresponding to ``tree.children()[i1].children()[i2]....children()[in]``. For the root, the path is ``()``. """ return self._expanded_trees[path_to_tree] def collapsed_tree(self, *path_to_tree): """ Return the ``TreeSegmentWidget`` for the specified subtree. :param path_to_tree: A list of indices i1, i2, ..., in, where the desired widget is the widget corresponding to ``tree.children()[i1].children()[i2]....children()[in]``. For the root, the path is ``()``. """ return self._collapsed_trees[path_to_tree] def bind_click_trees(self, callback, button=1): """ Add a binding to all tree segments. """ for tseg in list(self._expanded_trees.values()): tseg.bind_click(callback, button) for tseg in list(self._collapsed_trees.values()): tseg.bind_click(callback, button) def bind_drag_trees(self, callback, button=1): """ Add a binding to all tree segments. """ for tseg in list(self._expanded_trees.values()): tseg.bind_drag(callback, button) for tseg in list(self._collapsed_trees.values()): tseg.bind_drag(callback, button) def bind_click_leaves(self, callback, button=1): """ Add a binding to all leaves. """ for leaf in self._leaves: leaf.bind_click(callback, button) for leaf in self._leaves: leaf.bind_click(callback, button) def bind_drag_leaves(self, callback, button=1): """ Add a binding to all leaves. """ for leaf in self._leaves: leaf.bind_drag(callback, button) for leaf in self._leaves: leaf.bind_drag(callback, button) def bind_click_nodes(self, callback, button=1): """ Add a binding to all nodes. """ for node in self._nodes: node.bind_click(callback, button) for node in self._nodes: node.bind_click(callback, button) def bind_drag_nodes(self, callback, button=1): """ Add a binding to all nodes. """ for node in self._nodes: node.bind_drag(callback, button) for node in self._nodes: node.bind_drag(callback, button) def _make_collapsed_trees(self, canvas, t, key): if not isinstance(t, Tree): return make_node = self._make_node make_leaf = self._make_leaf node = make_node(canvas, t.label(), **self._nodeattribs) self._nodes.append(node) leaves = [make_leaf(canvas, l, **self._leafattribs) for l in t.leaves()] self._leaves += leaves treeseg = TreeSegmentWidget(canvas, node, leaves, roof=1, color=self._roof_color, fill=self._roof_fill, width=self._line_width) self._collapsed_trees[key] = treeseg self._keys[treeseg] = key #self._add_child_widget(treeseg) treeseg.hide() # Build trees for children. 
for i in range(len(t)): child = t[i] self._make_collapsed_trees(canvas, child, key + (i,)) def _make_expanded_tree(self, canvas, t, key): make_node = self._make_node make_leaf = self._make_leaf if isinstance(t, Tree): node = make_node(canvas, t.label(), **self._nodeattribs) self._nodes.append(node) children = t subtrees = [self._make_expanded_tree(canvas, children[i], key+(i,)) for i in range(len(children))] treeseg = TreeSegmentWidget(canvas, node, subtrees, color=self._line_color, width=self._line_width) self._expanded_trees[key] = treeseg self._keys[treeseg] = key return treeseg else: leaf = make_leaf(canvas, t, **self._leafattribs) self._leaves.append(leaf) return leaf def __setitem__(self, attr, value): if attr[:5] == 'node_': for node in self._nodes: node[attr[5:]] = value elif attr[:5] == 'leaf_': for leaf in self._leaves: leaf[attr[5:]] = value elif attr == 'line_color': self._line_color = value for tseg in list(self._expanded_trees.values()): tseg['color'] = value elif attr == 'line_width': self._line_width = value for tseg in list(self._expanded_trees.values()): tseg['width'] = value for tseg in list(self._collapsed_trees.values()): tseg['width'] = value elif attr == 'roof_color': self._roof_color = value for tseg in list(self._collapsed_trees.values()): tseg['color'] = value elif attr == 'roof_fill': self._roof_fill = value for tseg in list(self._collapsed_trees.values()): tseg['fill'] = value elif attr == 'shapeable': self._shapeable = value for tseg in list(self._expanded_trees.values()): tseg['draggable'] = value for tseg in list(self._collapsed_trees.values()): tseg['draggable'] = value for leaf in self._leaves: leaf['draggable'] = value elif attr == 'xspace': self._xspace = value for tseg in list(self._expanded_trees.values()): tseg['xspace'] = value for tseg in list(self._collapsed_trees.values()): tseg['xspace'] = value self.manage() elif attr == 'yspace': self._yspace = value for tseg in list(self._expanded_trees.values()): tseg['yspace'] = value for tseg in list(self._collapsed_trees.values()): tseg['yspace'] = value self.manage() elif attr == 'orientation': self._orientation = value for tseg in list(self._expanded_trees.values()): tseg['orientation'] = value for tseg in list(self._collapsed_trees.values()): tseg['orientation'] = value self.manage() elif attr == 'ordered': self._ordered = value for tseg in list(self._expanded_trees.values()): tseg['ordered'] = value for tseg in list(self._collapsed_trees.values()): tseg['ordered'] = value else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr[:5] == 'node_': return self._nodeattribs.get(attr[5:], None) elif attr[:5] == 'leaf_': return self._leafattribs.get(attr[5:], None) elif attr[:4] == 'loc_': return self._locattribs.get(attr[4:], None) elif attr == 'line_color': return self._line_color elif attr == 'line_width': return self._line_width elif attr == 'roof_color': return self._roof_color elif attr == 'roof_fill': return self._roof_fill elif attr == 'shapeable': return self._shapeable elif attr == 'xspace': return self._xspace elif attr == 'yspace': return self._yspace elif attr == 'orientation': return self._orientation else: return CanvasWidget.__getitem__(self, attr) def _tags(self): return [] def _manage(self): segs = list(self._expanded_trees.values()) + list(self._collapsed_trees.values()) for tseg in segs: if tseg.hidden(): tseg.show() tseg.manage() tseg.hide() def toggle_collapsed(self, treeseg): """ Collapse/expand a tree. 
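
        This is typically used as a click callback; for example (a sketch
        mirroring the demo code below, where ``widget`` is an existing
        ``TreeWidget``):

            >>> widget.bind_click_trees(widget.toggle_collapsed)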
""" old_treeseg = treeseg if old_treeseg['roof']: new_treeseg = self._expanded_trees[self._keys[old_treeseg]] else: new_treeseg = self._collapsed_trees[self._keys[old_treeseg]] # Replace the old tree with the new tree. if old_treeseg.parent() is self: self._remove_child_widget(old_treeseg) self._add_child_widget(new_treeseg) self._treeseg = new_treeseg else: old_treeseg.parent().replace_child(old_treeseg, new_treeseg) # Move the new tree to where the old tree was. Show it first, # so we can find its bounding box. new_treeseg.show() (newx, newy) = new_treeseg.label().bbox()[:2] (oldx, oldy) = old_treeseg.label().bbox()[:2] new_treeseg.move(oldx-newx, oldy-newy) # Hide the old tree old_treeseg.hide() # We could do parent.manage() here instead, if we wanted. new_treeseg.parent().update(new_treeseg) ##////////////////////////////////////////////////////// ## draw_trees ##////////////////////////////////////////////////////// class TreeView(object): def __init__(self, *trees): from math import sqrt, ceil self._trees = trees self._top = Tk() self._top.title('NLTK') self._top.bind('', self.destroy) self._top.bind('', self.destroy) cf = self._cframe = CanvasFrame(self._top) self._top.bind('', self._cframe.print_to_file) # Size is variable. self._size = IntVar(self._top) self._size.set(12) bold = ('helvetica', -self._size.get(), 'bold') helv = ('helvetica', -self._size.get()) # Lay the trees out in a square. self._width = int(ceil(sqrt(len(trees)))) self._widgets = [] for i in range(len(trees)): widget = TreeWidget(cf.canvas(), trees[i], node_font=bold, leaf_color='#008040', node_color='#004080', roof_color='#004040', roof_fill='white', line_color='#004040', draggable=1, leaf_font=helv) widget.bind_click_trees(widget.toggle_collapsed) self._widgets.append(widget) cf.add_widget(widget, 0, 0) self._layout() self._cframe.pack(expand=1, fill='both') self._init_menubar() def _layout(self): i = x = y = ymax = 0 width = self._width for i in range(len(self._widgets)): widget = self._widgets[i] (oldx, oldy) = widget.bbox()[:2] if i % width == 0: y = ymax x = 0 widget.move(x-oldx, y-oldy) x = widget.bbox()[2] + 10 ymax = max(ymax, widget.bbox()[3] + 10) def _init_menubar(self): menubar = Menu(self._top) filemenu = Menu(menubar, tearoff=0) filemenu.add_command(label='Print to Postscript', underline=0, command=self._cframe.print_to_file, accelerator='Ctrl-p') filemenu.add_command(label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x') menubar.add_cascade(label='File', underline=0, menu=filemenu) zoommenu = Menu(menubar, tearoff=0) zoommenu.add_radiobutton(label='Tiny', variable=self._size, underline=0, value=10, command=self.resize) zoommenu.add_radiobutton(label='Small', variable=self._size, underline=0, value=12, command=self.resize) zoommenu.add_radiobutton(label='Medium', variable=self._size, underline=0, value=14, command=self.resize) zoommenu.add_radiobutton(label='Large', variable=self._size, underline=0, value=28, command=self.resize) zoommenu.add_radiobutton(label='Huge', variable=self._size, underline=0, value=50, command=self.resize) menubar.add_cascade(label='Zoom', underline=0, menu=zoommenu) self._top.config(menu=menubar) def resize(self, *e): bold = ('helvetica', -self._size.get(), 'bold') helv = ('helvetica', -self._size.get()) xspace = self._size.get() yspace = self._size.get() for widget in self._widgets: widget['node_font'] = bold widget['leaf_font'] = helv widget['xspace'] = xspace widget['yspace'] = yspace if self._size.get() < 20: widget['line_width'] = 1 elif 
self._size.get() < 30: widget['line_width'] = 2 else: widget['line_width'] = 3 self._layout() def destroy(self, *e): if self._top is None: return self._top.destroy() self._top = None def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this demo is created from a non-interactive program (e.g. from a secript); otherwise, the demo will close as soon as the script completes. """ if in_idle(): return self._top.mainloop(*args, **kwargs) def draw_trees(*trees): """ Open a new window containing a graphical diagram of the given trees. :rtype: None """ TreeView(*trees).mainloop() return ##////////////////////////////////////////////////////// ## Demo Code ##////////////////////////////////////////////////////// def demo(): import random def fill(cw): cw['fill'] = '#%06d' % random.randint(0,999999) cf = CanvasFrame(width=550, height=450, closeenough=2) t = Tree.fromstring(''' (S (NP the very big cat) (VP (Adv sorta) (V saw) (NP (Det the) (N dog))))''') tc = TreeWidget(cf.canvas(), t, draggable=1, node_font=('helvetica', -14, 'bold'), leaf_font=('helvetica', -12, 'italic'), roof_fill='white', roof_color='black', leaf_color='green4', node_color='blue2') cf.add_widget(tc,10,10) def boxit(canvas, text): big = ('helvetica', -16, 'bold') return BoxWidget(canvas, TextWidget(canvas, text, font=big), fill='green') def ovalit(canvas, text): return OvalWidget(canvas, TextWidget(canvas, text), fill='cyan') treetok = Tree.fromstring('(S (NP this tree) (VP (V is) (AdjP shapeable)))') tc2 = TreeWidget(cf.canvas(), treetok, boxit, ovalit, shapeable=1) def color(node): node['color'] = '#%04d00' % random.randint(0,9999) def color2(treeseg): treeseg.label()['fill'] = '#%06d' % random.randint(0,9999) treeseg.label().child()['color'] = 'white' tc.bind_click_trees(tc.toggle_collapsed) tc2.bind_click_trees(tc2.toggle_collapsed) tc.bind_click_nodes(color, 3) tc2.expanded_tree(1).bind_click(color2, 3) tc2.expanded_tree().bind_click(color2, 3) paren = ParenWidget(cf.canvas(), tc2) cf.add_widget(paren, tc.bbox()[2]+10, 10) tree3 = Tree.fromstring(''' (S (NP this tree) (AUX was) (VP (V built) (PP (P with) (NP (N tree_to_treesegment)))))''') tc3 = tree_to_treesegment(cf.canvas(), tree3, tree_color='green4', tree_xspace=2, tree_width=2) tc3['draggable'] = 1 cf.add_widget(tc3, 10, tc.bbox()[3]+10) def orientswitch(treewidget): if treewidget['orientation'] == 'horizontal': treewidget.expanded_tree(1,1).subtrees()[0].set_text('vertical') treewidget.collapsed_tree(1,1).subtrees()[0].set_text('vertical') treewidget.collapsed_tree(1).subtrees()[1].set_text('vertical') treewidget.collapsed_tree().subtrees()[3].set_text('vertical') treewidget['orientation'] = 'vertical' else: treewidget.expanded_tree(1,1).subtrees()[0].set_text('horizontal') treewidget.collapsed_tree(1,1).subtrees()[0].set_text('horizontal') treewidget.collapsed_tree(1).subtrees()[1].set_text('horizontal') treewidget.collapsed_tree().subtrees()[3].set_text('horizontal') treewidget['orientation'] = 'horizontal' text = """ Try clicking, right clicking, and dragging different elements of each of the trees. The top-left tree is a TreeWidget built from a Tree. The top-right is a TreeWidget built from a Tree, using non-default widget constructors for the nodes & leaves (BoxWidget and OvalWidget). 
The bottom-left tree is built from tree_to_treesegment.""" twidget = TextWidget(cf.canvas(), text.strip()) textbox = BoxWidget(cf.canvas(), twidget, fill='white', draggable=1) cf.add_widget(textbox, tc3.bbox()[2]+10, tc2.bbox()[3]+10) tree4 = Tree.fromstring('(S (NP this tree) (VP (V is) (Adj horizontal)))') tc4 = TreeWidget(cf.canvas(), tree4, draggable=1, line_color='brown2', roof_color='brown2', node_font=('helvetica', -12, 'bold'), node_color='brown4', orientation='horizontal') tc4.manage() cf.add_widget(tc4, tc3.bbox()[2]+10, textbox.bbox()[3]+10) tc4.bind_click(orientswitch) tc4.bind_click_trees(tc4.toggle_collapsed, 3) # Run mainloop cf.mainloop() if __name__ == '__main__': demo() nltk-3.1/nltk/draw/util.py0000644000076500000240000025100612607224144015316 0ustar sbstaff00000000000000# Natural Language Toolkit: Drawing utilities # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Tools for graphically displaying and interacting with the objects and processing classes defined by the Toolkit. These tools are primarily intended to help students visualize the objects that they create. The graphical tools are typically built using "canvas widgets", each of which encapsulates the graphical elements and bindings used to display a complex object on a Tkinter ``Canvas``. For example, NLTK defines canvas widgets for displaying trees and directed graphs, as well as a number of simpler widgets. These canvas widgets make it easier to build new graphical tools and demos. See the class documentation for ``CanvasWidget`` for more information. The ``nltk.draw`` module defines the abstract ``CanvasWidget`` base class, and a number of simple canvas widgets. The remaining canvas widgets are defined by submodules, such as ``nltk.draw.tree``. The ``nltk.draw`` module also defines ``CanvasFrame``, which encapsulates a ``Canvas`` and its scrollbars. It uses a ``ScrollWatcherWidget`` to ensure that all canvas widgets contained on its canvas are within the scroll region. Acknowledgements: Many of the ideas behind the canvas widget system are derived from ``CLIG``, a Tk-based grapher for linguistic data structures. For more information, see the CLIG homepage (http://www.ags.uni-sb.de/~konrad/clig.html). """ import nltk.compat from tkinter import (Button, Canvas, Entry, Frame, Label, Menu, Menubutton, RAISED, Scrollbar, StringVar, Text, Tk, Toplevel, Widget) import tkinter.font, tkinter.messagebox, tkinter.filedialog from nltk.util import in_idle ##////////////////////////////////////////////////////// ## CanvasWidget ##////////////////////////////////////////////////////// class CanvasWidget(object): """ A collection of graphical elements and bindings used to display a complex object on a Tkinter ``Canvas``. A canvas widget is responsible for managing the ``Canvas`` tags and callback bindings necessary to display and interact with the object. Canvas widgets are often organized into hierarchies, where parent canvas widgets control aspects of their child widgets. Each canvas widget is bound to a single ``Canvas``. This ``Canvas`` is specified as the first argument to the ``CanvasWidget``'s constructor. Attributes. Each canvas widget can support a variety of "attributes", which control how the canvas widget is displayed. Some typical examples attributes are ``color``, ``font``, and ``radius``. Each attribute has a default value. 
This default value can be overridden in the constructor, using keyword arguments of the form ``attribute=value``: >>> from nltk.draw.util import TextWidget >>> cn = TextWidget(c, 'test', color='red') Attribute values can also be changed after a canvas widget has been constructed, using the ``__setitem__`` operator: >>> cn['font'] = 'times' The current value of an attribute value can be queried using the ``__getitem__`` operator: >>> cn['color'] red For a list of the attributes supported by a type of canvas widget, see its class documentation. Interaction. The attribute ``'draggable'`` controls whether the user can drag a canvas widget around the canvas. By default, canvas widgets are not draggable. ``CanvasWidget`` provides callback support for two types of user interaction: clicking and dragging. The method ``bind_click`` registers a callback function that is called whenever the canvas widget is clicked. The method ``bind_drag`` registers a callback function that is called after the canvas widget is dragged. If the user clicks or drags a canvas widget with no registered callback function, then the interaction event will propagate to its parent. For each canvas widget, only one callback function may be registered for an interaction event. Callback functions can be deregistered with the ``unbind_click`` and ``unbind_drag`` methods. Subclassing. ``CanvasWidget`` is an abstract class. Subclasses are required to implement the following methods: - ``__init__``: Builds a new canvas widget. It must perform the following three tasks (in order): - Create any new graphical elements. - Call ``_add_child_widget`` on each child widget. - Call the ``CanvasWidget`` constructor. - ``_tags``: Returns a list of the canvas tags for all graphical elements managed by this canvas widget, not including graphical elements managed by its child widgets. - ``_manage``: Arranges the child widgets of this canvas widget. This is typically only called when the canvas widget is created. - ``_update``: Update this canvas widget in response to a change in a single child. For a ``CanvasWidget`` with no child widgets, the default definitions for ``_manage`` and ``_update`` may be used. If a subclass defines any attributes, then it should implement ``__getitem__`` and ``__setitem__``. If either of these methods is called with an unknown attribute, then they should propagate the request to ``CanvasWidget``. Most subclasses implement a number of additional methods that modify the ``CanvasWidget`` in some way. These methods must call ``parent.update(self)`` after making any changes to the canvas widget's graphical elements. The canvas widget must also call ``parent.update(self)`` after changing any attribute value that affects the shape or position of the canvas widget's graphical elements. :type __canvas: Tkinter.Canvas :ivar __canvas: This ``CanvasWidget``'s canvas. :type __parent: CanvasWidget or None :ivar __parent: This ``CanvasWidget``'s hierarchical parent widget. :type __children: list(CanvasWidget) :ivar __children: This ``CanvasWidget``'s hierarchical child widgets. :type __updating: bool :ivar __updating: Is this canvas widget currently performing an update? If it is, then it will ignore any new update requests from child widgets. :type __draggable: bool :ivar __draggable: Is this canvas widget draggable? :type __press: event :ivar __press: The ButtonPress event that we're currently handling. 
:type __drag_x: int :ivar __drag_x: Where it's been moved to (to find dx) :type __drag_y: int :ivar __drag_y: Where it's been moved to (to find dy) :type __callbacks: dictionary :ivar __callbacks: Registered callbacks. Currently, four keys are used: ``1``, ``2``, ``3``, and ``'drag'``. The values are callback functions. Each callback function takes a single argument, which is the ``CanvasWidget`` that triggered the callback. """ def __init__(self, canvas, parent=None, **attribs): """ Create a new canvas widget. This constructor should only be called by subclass constructors; and it should be called only "after" the subclass has constructed all graphical canvas objects and registered all child widgets. :param canvas: This canvas widget's canvas. :type canvas: Tkinter.Canvas :param parent: This canvas widget's hierarchical parent. :type parent: CanvasWidget :param attribs: The new canvas widget's attributes. """ if self.__class__ == CanvasWidget: raise TypeError('CanvasWidget is an abstract base class') if not isinstance(canvas, Canvas): raise TypeError('Expected a canvas!') self.__canvas = canvas self.__parent = parent # If the subclass constructor called _add_child_widget, then # self.__children will already exist. if not hasattr(self, '_CanvasWidget__children'): self.__children = [] # Is this widget hidden? self.__hidden = 0 # Update control (prevents infinite loops) self.__updating = 0 # Button-press and drag callback handling. self.__press = None self.__drag_x = self.__drag_y = 0 self.__callbacks = {} self.__draggable = 0 # Set up attributes. for (attr, value) in list(attribs.items()): self[attr] = value # Manage this canvas widget self._manage() # Register any new bindings for tag in self._tags(): self.__canvas.tag_bind(tag, '', self.__press_cb) self.__canvas.tag_bind(tag, '', self.__press_cb) self.__canvas.tag_bind(tag, '', self.__press_cb) ##////////////////////////////////////////////////////// ## Inherited methods. ##////////////////////////////////////////////////////// def bbox(self): """ :return: A bounding box for this ``CanvasWidget``. The bounding box is a tuple of four coordinates, *(xmin, ymin, xmax, ymax)*, for a rectangle which encloses all of the canvas widget's graphical elements. Bounding box coordinates are specified with respect to the coordinate space of the ``Canvas``. :rtype: tuple(int, int, int, int) """ if self.__hidden: return (0,0,0,0) if len(self.tags()) == 0: raise ValueError('No tags') return self.__canvas.bbox(*self.tags()) def width(self): """ :return: The width of this canvas widget's bounding box, in its ``Canvas``'s coordinate space. :rtype: int """ if len(self.tags()) == 0: raise ValueError('No tags') bbox = self.__canvas.bbox(*self.tags()) return bbox[2]-bbox[0] def height(self): """ :return: The height of this canvas widget's bounding box, in its ``Canvas``'s coordinate space. :rtype: int """ if len(self.tags()) == 0: raise ValueError('No tags') bbox = self.__canvas.bbox(*self.tags()) return bbox[3]-bbox[1] def parent(self): """ :return: The hierarchical parent of this canvas widget. ``self`` is considered a subpart of its parent for purposes of user interaction. :rtype: CanvasWidget or None """ return self.__parent def child_widgets(self): """ :return: A list of the hierarchical children of this canvas widget. These children are considered part of ``self`` for purposes of user interaction. :rtype: list of CanvasWidget """ return self.__children def canvas(self): """ :return: The canvas that this canvas widget is bound to. 
:rtype: Tkinter.Canvas """ return self.__canvas def move(self, dx, dy): """ Move this canvas widget by a given distance. In particular, shift the canvas widget right by ``dx`` pixels, and down by ``dy`` pixels. Both ``dx`` and ``dy`` may be negative, resulting in leftward or upward movement. :type dx: int :param dx: The number of pixels to move this canvas widget rightwards. :type dy: int :param dy: The number of pixels to move this canvas widget downwards. :rtype: None """ if dx == dy == 0: return for tag in self.tags(): self.__canvas.move(tag, dx, dy) if self.__parent: self.__parent.update(self) def moveto(self, x, y, anchor='NW'): """ Move this canvas widget to the given location. In particular, shift the canvas widget such that the corner or side of the bounding box specified by ``anchor`` is at location (``x``, ``y``). :param x,y: The location that the canvas widget should be moved to. :param anchor: The corner or side of the canvas widget that should be moved to the specified location. ``'N'`` specifies the top center; ``'NE'`` specifies the top right corner; etc. """ x1,y1,x2,y2 = self.bbox() if anchor == 'NW': self.move(x-x1, y-y1) if anchor == 'N': self.move(x-x1/2-x2/2, y-y1) if anchor == 'NE': self.move(x-x2, y-y1) if anchor == 'E': self.move(x-x2, y-y1/2-y2/2) if anchor == 'SE': self.move(x-x2, y-y2) if anchor == 'S': self.move(x-x1/2-x2/2, y-y2) if anchor == 'SW': self.move(x-x1, y-y2) if anchor == 'W': self.move(x-x1, y-y1/2-y2/2) def destroy(self): """ Remove this ``CanvasWidget`` from its ``Canvas``. After a ``CanvasWidget`` has been destroyed, it should not be accessed. Note that you only need to destroy a top-level ``CanvasWidget``; its child widgets will be destroyed automatically. If you destroy a non-top-level ``CanvasWidget``, then the entire top-level widget will be destroyed. :raise ValueError: if this ``CanvasWidget`` has a parent. :rtype: None """ if self.__parent is not None: self.__parent.destroy() return for tag in self.tags(): self.__canvas.tag_unbind(tag, '') self.__canvas.tag_unbind(tag, '') self.__canvas.tag_unbind(tag, '') self.__canvas.delete(*self.tags()) self.__canvas = None def update(self, child): """ Update the graphical display of this canvas widget, and all of its ancestors, in response to a change in one of this canvas widget's children. :param child: The child widget that changed. :type child: CanvasWidget """ if self.__hidden or child.__hidden: return # If we're already updating, then do nothing. This prevents # infinite loops when _update modifies its children. if self.__updating: return self.__updating = 1 # Update this CanvasWidget. self._update(child) # Propagate update request to the parent. if self.__parent: self.__parent.update(self) # We're done updating. self.__updating = 0 def manage(self): """ Arrange this canvas widget and all of its descendants. :rtype: None """ if self.__hidden: return for child in self.__children: child.manage() self._manage() def tags(self): """ :return: a list of the canvas tags for all graphical elements managed by this canvas widget, including graphical elements managed by its child widgets. :rtype: list of int """ if self.__canvas is None: raise ValueError('Attempt to access a destroyed canvas widget') tags = [] tags += self._tags() for child in self.__children: tags += child.tags() return tags def __setitem__(self, attr, value): """ Set the value of the attribute ``attr`` to ``value``. See the class documentation for a list of attributes supported by this canvas widget. 
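
        For example (a sketch, not from the original documentation; it
        assumes a Tk display is available), ``'draggable'`` is the one
        attribute handled by ``CanvasWidget`` itself, and click callbacks
        are attached with ``bind_click``:

            >>> from tkinter import Tk, Canvas
            >>> from nltk.draw.util import TextWidget
            >>> c = Canvas(Tk())
            >>> tw = TextWidget(c, 'click or drag me')
            >>> tw['draggable'] = 1
            >>> tw['draggable']
            1
            >>> def on_click(widget): widget['color'] = 'red'
            >>> tw.bind_click(on_click)
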
:rtype: None """ if attr == 'draggable': self.__draggable = value else: raise ValueError('Unknown attribute %r' % attr) def __getitem__(self, attr): """ :return: the value of the attribute ``attr``. See the class documentation for a list of attributes supported by this canvas widget. :rtype: (any) """ if attr == 'draggable': return self.__draggable else: raise ValueError('Unknown attribute %r' % attr) def __repr__(self): """ :return: a string representation of this canvas widget. :rtype: str """ return '<%s>' % self.__class__.__name__ def hide(self): """ Temporarily hide this canvas widget. :rtype: None """ self.__hidden = 1 for tag in self.tags(): self.__canvas.itemconfig(tag, state='hidden') def show(self): """ Show a hidden canvas widget. :rtype: None """ self.__hidden = 0 for tag in self.tags(): self.__canvas.itemconfig(tag, state='normal') def hidden(self): """ :return: True if this canvas widget is hidden. :rtype: bool """ return self.__hidden ##////////////////////////////////////////////////////// ## Callback interface ##////////////////////////////////////////////////////// def bind_click(self, callback, button=1): """ Register a new callback that will be called whenever this ``CanvasWidget`` is clicked on. :type callback: function :param callback: The callback function that will be called whenever this ``CanvasWidget`` is clicked. This function will be called with this ``CanvasWidget`` as its argument. :type button: int :param button: Which button the user should use to click on this ``CanvasWidget``. Typically, this should be 1 (left button), 3 (right button), or 2 (middle button). """ self.__callbacks[button] = callback def bind_drag(self, callback): """ Register a new callback that will be called after this ``CanvasWidget`` is dragged. This implicitly makes this ``CanvasWidget`` draggable. :type callback: function :param callback: The callback function that will be called whenever this ``CanvasWidget`` is clicked. This function will be called with this ``CanvasWidget`` as its argument. """ self.__draggable = 1 self.__callbacks['drag'] = callback def unbind_click(self, button=1): """ Remove a callback that was registered with ``bind_click``. :type button: int :param button: Which button the user should use to click on this ``CanvasWidget``. Typically, this should be 1 (left button), 3 (right button), or 2 (middle button). """ try: del self.__callbacks[button] except: pass def unbind_drag(self): """ Remove a callback that was registered with ``bind_drag``. """ try: del self.__callbacks['drag'] except: pass ##////////////////////////////////////////////////////// ## Callback internals ##////////////////////////////////////////////////////// def __press_cb(self, event): """ Handle a button-press event: - record the button press event in ``self.__press`` - register a button-release callback. - if this CanvasWidget or any of its ancestors are draggable, then register the appropriate motion callback. """ # If we're already waiting for a button release, then ignore # this new button press. if (self.__canvas.bind('') or self.__canvas.bind('') or self.__canvas.bind('')): return # Unbind motion (just in case; this shouldn't be necessary) self.__canvas.unbind('') # Record the button press event. self.__press = event # If any ancestor is draggable, set up a motion callback. # (Only if they pressed button number 1) if event.num == 1: widget = self while widget is not None: if widget['draggable']: widget.__start_drag(event) break widget = widget.parent() # Set up the button release callback. 
        self.__canvas.bind('<ButtonRelease-%d>' % event.num,
                           self.__release_cb)

    def __start_drag(self, event):
        """
        Begin dragging this object:
        - register a motion callback
        - record the drag coordinates
        """
        self.__canvas.bind('<Motion>', self.__motion_cb)
        self.__drag_x = event.x
        self.__drag_y = event.y

    def __motion_cb(self, event):
        """
        Handle a motion event:
        - move this object to the new location
        - record the new drag coordinates
        """
        self.move(event.x-self.__drag_x, event.y-self.__drag_y)
        self.__drag_x = event.x
        self.__drag_y = event.y

    def __release_cb(self, event):
        """
        Handle a release callback:
        - unregister motion & button release callbacks.
        - decide whether they clicked, dragged, or cancelled
        - call the appropriate handler.
        """
        # Unbind the button release & motion callbacks.
        self.__canvas.unbind('<ButtonRelease-%d>' % event.num)
        self.__canvas.unbind('<Motion>')

        # Is it a click or a drag?
        if (event.time - self.__press.time < 100 and
            abs(event.x-self.__press.x) + abs(event.y-self.__press.y) < 5):
            # Move it back, if we were dragging.
            if self.__draggable and event.num == 1:
                self.move(self.__press.x - self.__drag_x,
                          self.__press.y - self.__drag_y)
            self.__click(event.num)
        elif event.num == 1:
            self.__drag()

        self.__press = None

    def __drag(self):
        """
        If this ``CanvasWidget`` has a drag callback, then call it;
        otherwise, find the closest ancestor with a drag callback, and
        call it.  If no ancestors have a drag callback, do nothing.
        """
        if self.__draggable:
            if 'drag' in self.__callbacks:
                cb = self.__callbacks['drag']
                try:
                    cb(self)
                except:
                    print('Error in drag callback for %r' % self)
        elif self.__parent is not None:
            self.__parent.__drag()

    def __click(self, button):
        """
        If this ``CanvasWidget`` has a click callback, then call it;
        otherwise, find the closest ancestor with a click callback, and
        call it.  If no ancestors have a click callback, do nothing.
        """
        if button in self.__callbacks:
            cb = self.__callbacks[button]
            #try:
            cb(self)
            #except:
            #    print 'Error in click callback for %r' % self
            #    raise
        elif self.__parent is not None:
            self.__parent.__click(button)

    ##//////////////////////////////////////////////////////
    ##  Child/parent Handling
    ##//////////////////////////////////////////////////////

    def _add_child_widget(self, child):
        """
        Register a hierarchical child widget.  The child will be
        considered part of this canvas widget for purposes of user
        interaction.  ``_add_child_widget`` has two direct effects:
        - It sets ``child``'s parent to this canvas widget.
        - It adds ``child`` to the list of canvas widgets returned by
          the ``child_widgets`` member function.

        :param child: The new child widget.  ``child`` must not already
            have a parent.
        :type child: CanvasWidget
        """
        if not hasattr(self, '_CanvasWidget__children'):
            self.__children = []
        if child.__parent is not None:
            raise ValueError('%s already has a parent' % child)
        child.__parent = self
        self.__children.append(child)

    def _remove_child_widget(self, child):
        """
        Remove a hierarchical child widget.  This child will no longer
        be considered part of this canvas widget for purposes of user
        interaction.  ``_remove_child_widget`` has two direct effects:
        - It sets ``child``'s parent to None.
        - It removes ``child`` from the list of canvas widgets returned
          by the ``child_widgets`` member function.

        :param child: The child widget to remove.  ``child`` must be a
            child of this canvas widget.
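
        To illustrate the subclassing contract described in the class
        docstring, here is a minimal sketch (a hypothetical
        ``UnderlineWidget``, not part of this module; a Tk display is
        assumed) that draws a line under a single child widget:

            >>> from tkinter import Tk, Canvas
            >>> from nltk.draw.util import CanvasWidget, TextWidget
            >>> c = Canvas(Tk())
            >>> class UnderlineWidget(CanvasWidget):
            ...     def __init__(self, canvas, child, **attribs):
            ...         self._child = child
            ...         # 1. Create the new graphical elements.
            ...         self._line = canvas.create_line(1, 1, 1, 1)
            ...         # 2. Register each child widget.
            ...         self._add_child_widget(child)
            ...         # 3. Call the CanvasWidget constructor last.
            ...         CanvasWidget.__init__(self, canvas, **attribs)
            ...     def _tags(self):
            ...         # Only this widget's own canvas items, not the child's.
            ...         return [self._line]
            ...     def _update(self, child):
            ...         (x1, y1, x2, y2) = child.bbox()
            ...         self.canvas().coords(self._line, x1, y2+2, x2, y2+2)
            ...     def _manage(self):
            ...         self._update(self._child)
            >>> underlined = UnderlineWidget(c, TextWidget(c, 'hello'))
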
:type child: CanvasWidget """ self.__children.remove(child) child.__parent = None ##////////////////////////////////////////////////////// ## Defined by subclass ##////////////////////////////////////////////////////// def _tags(self): """ :return: a list of canvas tags for all graphical elements managed by this canvas widget, not including graphical elements managed by its child widgets. :rtype: list of int """ raise NotImplementedError() def _manage(self): """ Arrange the child widgets of this canvas widget. This method is called when the canvas widget is initially created. It is also called if the user calls the ``manage`` method on this canvas widget or any of its ancestors. :rtype: None """ pass def _update(self, child): """ Update this canvas widget in response to a change in one of its children. :param child: The child that changed. :type child: CanvasWidget :rtype: None """ pass ##////////////////////////////////////////////////////// ## Basic widgets. ##////////////////////////////////////////////////////// class TextWidget(CanvasWidget): """ A canvas widget that displays a single string of text. Attributes: - ``color``: the color of the text. - ``font``: the font used to display the text. - ``justify``: justification for multi-line texts. Valid values are ``left``, ``center``, and ``right``. - ``width``: the width of the text. If the text is wider than this width, it will be line-wrapped at whitespace. - ``draggable``: whether the text can be dragged by the user. """ def __init__(self, canvas, text, **attribs): """ Create a new text widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :type text: str :param text: The string of text to display. :param attribs: The new canvas widget's attributes. """ self._text = text self._tag = canvas.create_text(1, 1, text=text) CanvasWidget.__init__(self, canvas, **attribs) def __setitem__(self, attr, value): if attr in ('color', 'font', 'justify', 'width'): if attr == 'color': attr = 'fill' self.canvas().itemconfig(self._tag, {attr:value}) else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == 'width': return int(self.canvas().itemcget(self._tag, attr)) elif attr in ('color', 'font', 'justify'): if attr == 'color': attr = 'fill' return self.canvas().itemcget(self._tag, attr) else: return CanvasWidget.__getitem__(self, attr) def _tags(self): return [self._tag] def text(self): """ :return: The text displayed by this text widget. :rtype: str """ return self.canvas().itemcget(self._tag, 'TEXT') def set_text(self, text): """ Change the text that is displayed by this text widget. :type text: str :param text: The string of text to display. :rtype: None """ self.canvas().itemconfig(self._tag, text=text) if self.parent() is not None: self.parent().update(self) def __repr__(self): return '[Text: %r]' % self._text class SymbolWidget(TextWidget): """ A canvas widget that displays special symbols, such as the negation sign and the exists operator. Symbols are specified by name. Currently, the following symbol names are defined: ``neg``, ``disj``, ``conj``, ``lambda``, ``merge``, ``forall``, ``exists``, ``subseteq``, ``subset``, ``notsubset``, ``emptyset``, ``imp``, ``rightarrow``, ``equal``, ``notequal``, ``epsilon``. Attributes: - ``color``: the color of the text. - ``draggable``: whether the text can be dragged by the user. :cvar SYMBOLS: A dictionary mapping from symbols to the character in the ``symbol`` font used to render them. 
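
    For example (a sketch, not from the original documentation; it assumes
    a Tk display is available):

        >>> from tkinter import Tk, Canvas
        >>> from nltk.draw.util import SymbolWidget
        >>> c = Canvas(Tk())
        >>> sym = SymbolWidget(c, 'forall', color='blue')
        >>> sym.symbol()
        'forall'
        >>> sym.set_symbol('exists')

    ``SymbolWidget.symbolsheet()`` displays the full symbol font, which is
    useful when adding new entries to ``SYMBOLS``.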
""" SYMBOLS = {'neg':'\330', 'disj':'\332', 'conj': '\331', 'lambda': '\154', 'merge': '\304', 'forall': '\042', 'exists': '\044', 'subseteq': '\315', 'subset': '\314', 'notsubset': '\313', 'emptyset': '\306', 'imp': '\336', 'rightarrow': chr(222), #'\256', 'equal': '\75', 'notequal': '\271', 'intersection': '\307', 'union': '\310', 'epsilon': 'e', } def __init__(self, canvas, symbol, **attribs): """ Create a new symbol widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :type symbol: str :param symbol: The name of the symbol to display. :param attribs: The new canvas widget's attributes. """ attribs['font'] = 'symbol' TextWidget.__init__(self, canvas, '', **attribs) self.set_symbol(symbol) def symbol(self): """ :return: the name of the symbol that is displayed by this symbol widget. :rtype: str """ return self._symbol def set_symbol(self, symbol): """ Change the symbol that is displayed by this symbol widget. :type symbol: str :param symbol: The name of the symbol to display. """ if symbol not in SymbolWidget.SYMBOLS: raise ValueError('Unknown symbol: %s' % symbol) self._symbol = symbol self.set_text(SymbolWidget.SYMBOLS[symbol]) def __repr__(self): return '[Symbol: %r]' % self._symbol @staticmethod def symbolsheet(size=20): """ Open a new Tkinter window that displays the entire alphabet for the symbol font. This is useful for constructing the ``SymbolWidget.SYMBOLS`` dictionary. """ top = Tk() def destroy(e, top=top): top.destroy() top.bind('q', destroy) Button(top, text='Quit', command=top.destroy).pack(side='bottom') text = Text(top, font=('helvetica', -size), width=20, height=30) text.pack(side='left') sb=Scrollbar(top, command=text.yview) text['yscrollcommand']=sb.set sb.pack(side='right', fill='y') text.tag_config('symbol', font=('symbol', -size)) for i in range(256): if i in (0,10): continue # null and newline for k,v in list(SymbolWidget.SYMBOLS.items()): if v == chr(i): text.insert('end', '%-10s\t' % k) break else: text.insert('end', '%-10d \t' % i) text.insert('end', '[%s]\n' % chr(i), 'symbol') top.mainloop() class AbstractContainerWidget(CanvasWidget): """ An abstract class for canvas widgets that contain a single child, such as ``BoxWidget`` and ``OvalWidget``. Subclasses must define a constructor, which should create any new graphical elements and then call the ``AbstractCanvasContainer`` constructor. Subclasses must also define the ``_update`` method and the ``_tags`` method; and any subclasses that define attributes should define ``__setitem__`` and ``__getitem__``. """ def __init__(self, canvas, child, **attribs): """ Create a new container widget. This constructor should only be called by subclass constructors. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :param child: The container's child widget. ``child`` must not have a parent. :type child: CanvasWidget :param attribs: The new canvas widget's attributes. """ self._child = child self._add_child_widget(child) CanvasWidget.__init__(self, canvas, **attribs) def _manage(self): self._update(self._child) def child(self): """ :return: The child widget contained by this container widget. :rtype: CanvasWidget """ return self._child def set_child(self, child): """ Change the child widget contained by this container widget. :param child: The new child widget. ``child`` must not have a parent. 
:type child: CanvasWidget :rtype: None """ self._remove_child_widget(self._child) self._add_child_widget(child) self._child = child self.update(child) def __repr__(self): name = self.__class__.__name__ if name[-6:] == 'Widget': name = name[:-6] return '[%s: %r]' % (name, self._child) class BoxWidget(AbstractContainerWidget): """ A canvas widget that places a box around a child widget. Attributes: - ``fill``: The color used to fill the interior of the box. - ``outline``: The color used to draw the outline of the box. - ``width``: The width of the outline of the box. - ``margin``: The number of pixels space left between the child and the box. - ``draggable``: whether the text can be dragged by the user. """ def __init__(self, canvas, child, **attribs): """ Create a new box widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :param child: The child widget. ``child`` must not have a parent. :type child: CanvasWidget :param attribs: The new canvas widget's attributes. """ self._child = child self._margin = 1 self._box = canvas.create_rectangle(1,1,1,1) canvas.tag_lower(self._box) AbstractContainerWidget.__init__(self, canvas, child, **attribs) def __setitem__(self, attr, value): if attr == 'margin': self._margin = value elif attr in ('outline', 'fill', 'width'): self.canvas().itemconfig(self._box, {attr:value}) else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == 'margin': return self._margin elif attr == 'width': return float(self.canvas().itemcget(self._box, attr)) elif attr in ('outline', 'fill', 'width'): return self.canvas().itemcget(self._box, attr) else: return CanvasWidget.__getitem__(self, attr) def _update(self, child): (x1, y1, x2, y2) = child.bbox() margin = self._margin + self['width']/2 self.canvas().coords(self._box, x1-margin, y1-margin, x2+margin, y2+margin) def _tags(self): return [self._box] class OvalWidget(AbstractContainerWidget): """ A canvas widget that places a oval around a child widget. Attributes: - ``fill``: The color used to fill the interior of the oval. - ``outline``: The color used to draw the outline of the oval. - ``width``: The width of the outline of the oval. - ``margin``: The number of pixels space left between the child and the oval. - ``draggable``: whether the text can be dragged by the user. - ``double``: If true, then a double-oval is drawn. """ def __init__(self, canvas, child, **attribs): """ Create a new oval widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :param child: The child widget. ``child`` must not have a parent. :type child: CanvasWidget :param attribs: The new canvas widget's attributes. """ self._child = child self._margin = 1 self._oval = canvas.create_oval(1,1,1,1) self._circle = attribs.pop('circle', False) self._double = attribs.pop('double', False) if self._double: self._oval2 = canvas.create_oval(1,1,1,1) else: self._oval2 = None canvas.tag_lower(self._oval) AbstractContainerWidget.__init__(self, canvas, child, **attribs) def __setitem__(self, attr, value): c = self.canvas() if attr == 'margin': self._margin = value elif attr == 'double': if value==True and self._oval2 is None: # Copy attributes & position from self._oval. 
x1, y1, x2, y2 = c.bbox(self._oval) w = self['width']*2 self._oval2 = c.create_oval(x1-w, y1-w, x2+w, y2+w, outline=c.itemcget(self._oval, 'outline'), width=c.itemcget(self._oval, 'width')) c.tag_lower(self._oval2) if value==False and self._oval2 is not None: c.delete(self._oval2) self._oval2 = None elif attr in ('outline', 'fill', 'width'): c.itemconfig(self._oval, {attr:value}) if self._oval2 is not None and attr!='fill': c.itemconfig(self._oval2, {attr:value}) if self._oval2 is not None and attr!='fill': self.canvas().itemconfig(self._oval2, {attr:value}) else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == 'margin': return self._margin elif attr == 'double': return self._double is not None elif attr == 'width': return float(self.canvas().itemcget(self._oval, attr)) elif attr in ('outline', 'fill', 'width'): return self.canvas().itemcget(self._oval, attr) else: return CanvasWidget.__getitem__(self, attr) # The ratio between inscribed & circumscribed ovals RATIO = 1.4142135623730949 def _update(self, child): R = OvalWidget.RATIO (x1, y1, x2, y2) = child.bbox() margin = self._margin # If we're a circle, pretend our contents are square. if self._circle: dx, dy = abs(x1-x2), abs(y1-y2) if dx > dy: y = (y1+y2)/2 y1, y2 = y-dx/2, y+dx/2 elif dy > dx: x = (x1+x2)/2 x1, x2 = x-dy/2, x+dy/2 # Find the four corners. left = int(( x1*(1+R) + x2*(1-R) ) / 2) right = left + int((x2-x1)*R) top = int(( y1*(1+R) + y2*(1-R) ) / 2) bot = top + int((y2-y1)*R) self.canvas().coords(self._oval, left-margin, top-margin, right+margin, bot+margin) if self._oval2 is not None: self.canvas().coords(self._oval2, left-margin+2, top-margin+2, right+margin-2, bot+margin-2) def _tags(self): if self._oval2 is None: return [self._oval] else: return [self._oval, self._oval2] class ParenWidget(AbstractContainerWidget): """ A canvas widget that places a pair of parenthases around a child widget. Attributes: - ``color``: The color used to draw the parenthases. - ``width``: The width of the parenthases. - ``draggable``: whether the text can be dragged by the user. """ def __init__(self, canvas, child, **attribs): """ Create a new parenthasis widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :param child: The child widget. ``child`` must not have a parent. :type child: CanvasWidget :param attribs: The new canvas widget's attributes. 
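
        For example (a sketch, not from the original documentation; it
        assumes a Tk display is available):

            >>> from tkinter import Tk, Canvas
            >>> from nltk.draw.util import TextWidget, ParenWidget
            >>> c = Canvas(Tk())
            >>> parens = ParenWidget(c, TextWidget(c, 'x + y'),
            ...                      color='red', width=2)
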
""" self._child = child self._oparen = canvas.create_arc(1,1,1,1, style='arc', start=90, extent=180) self._cparen = canvas.create_arc(1,1,1,1, style='arc', start=-90, extent=180) AbstractContainerWidget.__init__(self, canvas, child, **attribs) def __setitem__(self, attr, value): if attr == 'color': self.canvas().itemconfig(self._oparen, outline=value) self.canvas().itemconfig(self._cparen, outline=value) elif attr == 'width': self.canvas().itemconfig(self._oparen, width=value) self.canvas().itemconfig(self._cparen, width=value) else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == 'color': return self.canvas().itemcget(self._oparen, 'outline') elif attr == 'width': return self.canvas().itemcget(self._oparen, 'width') else: return CanvasWidget.__getitem__(self, attr) def _update(self, child): (x1, y1, x2, y2) = child.bbox() width = max((y2-y1)/6, 4) self.canvas().coords(self._oparen, x1-width, y1, x1+width, y2) self.canvas().coords(self._cparen, x2-width, y1, x2+width, y2) def _tags(self): return [self._oparen, self._cparen] class BracketWidget(AbstractContainerWidget): """ A canvas widget that places a pair of brackets around a child widget. Attributes: - ``color``: The color used to draw the brackets. - ``width``: The width of the brackets. - ``draggable``: whether the text can be dragged by the user. """ def __init__(self, canvas, child, **attribs): """ Create a new bracket widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :param child: The child widget. ``child`` must not have a parent. :type child: CanvasWidget :param attribs: The new canvas widget's attributes. """ self._child = child self._obrack = canvas.create_line(1,1,1,1,1,1,1,1) self._cbrack = canvas.create_line(1,1,1,1,1,1,1,1) AbstractContainerWidget.__init__(self, canvas, child, **attribs) def __setitem__(self, attr, value): if attr == 'color': self.canvas().itemconfig(self._obrack, fill=value) self.canvas().itemconfig(self._cbrack, fill=value) elif attr == 'width': self.canvas().itemconfig(self._obrack, width=value) self.canvas().itemconfig(self._cbrack, width=value) else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == 'color': return self.canvas().itemcget(self._obrack, 'outline') elif attr == 'width': return self.canvas().itemcget(self._obrack, 'width') else: return CanvasWidget.__getitem__(self, attr) def _update(self, child): (x1, y1, x2, y2) = child.bbox() width = max((y2-y1)/8, 2) self.canvas().coords(self._obrack, x1, y1, x1-width, y1, x1-width, y2, x1, y2) self.canvas().coords(self._cbrack, x2, y1, x2+width, y1, x2+width, y2, x2, y2) def _tags(self): return [self._obrack, self._cbrack] class SequenceWidget(CanvasWidget): """ A canvas widget that keeps a list of canvas widgets in a horizontal line. Attributes: - ``align``: The vertical alignment of the children. Possible values are ``'top'``, ``'center'``, and ``'bottom'``. By default, children are center-aligned. - ``space``: The amount of horizontal space to place between children. By default, one pixel of space is used. - ``ordered``: If true, then keep the children in their original order. """ def __init__(self, canvas, *children, **attribs): """ Create a new sequence widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :param children: The widgets that should be aligned horizontally. Each child must not have a parent. :type children: list(CanvasWidget) :param attribs: The new canvas widget's attributes. 
""" self._align = 'center' self._space = 1 self._ordered = False self._children = list(children) for child in children: self._add_child_widget(child) CanvasWidget.__init__(self, canvas, **attribs) def __setitem__(self, attr, value): if attr == 'align': if value not in ('top', 'bottom', 'center'): raise ValueError('Bad alignment: %r' % value) self._align = value elif attr == 'space': self._space = value elif attr == 'ordered': self._ordered = value else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == 'align': return self._align elif attr == 'space': return self._space elif attr == 'ordered': return self._ordered else: return CanvasWidget.__getitem__(self, attr) def _tags(self): return [] def _yalign(self, top, bot): if self._align == 'top': return top if self._align == 'bottom': return bot if self._align == 'center': return (top+bot)/2 def _update(self, child): # Align all children with child. (left, top, right, bot) = child.bbox() y = self._yalign(top, bot) for c in self._children: (x1, y1, x2, y2) = c.bbox() c.move(0, y-self._yalign(y1,y2)) if self._ordered and len(self._children) > 1: index = self._children.index(child) x = right + self._space for i in range(index+1, len(self._children)): (x1, y1, x2, y2) = self._children[i].bbox() if x > x1: self._children[i].move(x-x1, 0) x += x2-x1 + self._space x = left - self._space for i in range(index-1, -1, -1): (x1, y1, x2, y2) = self._children[i].bbox() if x < x2: self._children[i].move(x-x2, 0) x -= x2-x1 + self._space def _manage(self): if len(self._children) == 0: return child = self._children[0] # Align all children with child. (left, top, right, bot) = child.bbox() y = self._yalign(top, bot) index = self._children.index(child) # Line up children to the right of child. x = right + self._space for i in range(index+1, len(self._children)): (x1, y1, x2, y2) = self._children[i].bbox() self._children[i].move(x-x1, y-self._yalign(y1,y2)) x += x2-x1 + self._space # Line up children to the left of child. x = left - self._space for i in range(index-1, -1, -1): (x1, y1, x2, y2) = self._children[i].bbox() self._children[i].move(x-x2, y-self._yalign(y1,y2)) x -= x2-x1 + self._space def __repr__(self): return '[Sequence: ' + repr(self._children)[1:-1]+']' # Provide an alias for the child_widgets() member. children = CanvasWidget.child_widgets def replace_child(self, oldchild, newchild): """ Replace the child canvas widget ``oldchild`` with ``newchild``. ``newchild`` must not have a parent. ``oldchild``'s parent will be set to None. :type oldchild: CanvasWidget :param oldchild: The child canvas widget to remove. :type newchild: CanvasWidget :param newchild: The canvas widget that should replace ``oldchild``. """ index = self._children.index(oldchild) self._children[index] = newchild self._remove_child_widget(oldchild) self._add_child_widget(newchild) self.update(newchild) def remove_child(self, child): """ Remove the given child canvas widget. ``child``'s parent will be set ot None. :type child: CanvasWidget :param child: The child canvas widget to remove. """ index = self._children.index(child) del self._children[index] self._remove_child_widget(child) if len(self._children) > 0: self.update(self._children[0]) def insert_child(self, index, child): """ Insert a child canvas widget before a given index. :type child: CanvasWidget :param child: The canvas widget that should be inserted. :type index: int :param index: The index where the child widget should be inserted. 
In particular, the index of ``child`` will be ``index``; and the index of any children whose indices were greater than equal to ``index`` before ``child`` was inserted will be incremented by one. """ self._children.insert(index, child) self._add_child_widget(child) class StackWidget(CanvasWidget): """ A canvas widget that keeps a list of canvas widgets in a vertical line. Attributes: - ``align``: The horizontal alignment of the children. Possible values are ``'left'``, ``'center'``, and ``'right'``. By default, children are center-aligned. - ``space``: The amount of vertical space to place between children. By default, one pixel of space is used. - ``ordered``: If true, then keep the children in their original order. """ def __init__(self, canvas, *children, **attribs): """ Create a new stack widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :param children: The widgets that should be aligned vertically. Each child must not have a parent. :type children: list(CanvasWidget) :param attribs: The new canvas widget's attributes. """ self._align = 'center' self._space = 1 self._ordered = False self._children = list(children) for child in children: self._add_child_widget(child) CanvasWidget.__init__(self, canvas, **attribs) def __setitem__(self, attr, value): if attr == 'align': if value not in ('left', 'right', 'center'): raise ValueError('Bad alignment: %r' % value) self._align = value elif attr == 'space': self._space = value elif attr == 'ordered': self._ordered = value else: CanvasWidget.__setitem__(self, attr, value) def __getitem__(self, attr): if attr == 'align': return self._align elif attr == 'space': return self._space elif attr == 'ordered': return self._ordered else: return CanvasWidget.__getitem__(self, attr) def _tags(self): return [] def _xalign(self, left, right): if self._align == 'left': return left if self._align == 'right': return right if self._align == 'center': return (left+right)/2 def _update(self, child): # Align all children with child. (left, top, right, bot) = child.bbox() x = self._xalign(left, right) for c in self._children: (x1, y1, x2, y2) = c.bbox() c.move(x-self._xalign(x1,x2), 0) if self._ordered and len(self._children) > 1: index = self._children.index(child) y = bot + self._space for i in range(index+1, len(self._children)): (x1, y1, x2, y2) = self._children[i].bbox() if y > y1: self._children[i].move(0, y-y1) y += y2-y1 + self._space y = top - self._space for i in range(index-1, -1, -1): (x1, y1, x2, y2) = self._children[i].bbox() if y < y2: self._children[i].move(0, y-y2) y -= y2-y1 + self._space def _manage(self): if len(self._children) == 0: return child = self._children[0] # Align all children with child. (left, top, right, bot) = child.bbox() x = self._xalign(left, right) index = self._children.index(child) # Line up children below the child. y = bot + self._space for i in range(index+1, len(self._children)): (x1, y1, x2, y2) = self._children[i].bbox() self._children[i].move(x-self._xalign(x1,x2), y-y1) y += y2-y1 + self._space # Line up children above the child. y = top - self._space for i in range(index-1, -1, -1): (x1, y1, x2, y2) = self._children[i].bbox() self._children[i].move(x-self._xalign(x1,x2), y-y2) y -= y2-y1 + self._space def __repr__(self): return '[Stack: ' + repr(self._children)[1:-1]+']' # Provide an alias for the child_widgets() member. children = CanvasWidget.child_widgets def replace_child(self, oldchild, newchild): """ Replace the child canvas widget ``oldchild`` with ``newchild``. 
``newchild`` must not have a parent. ``oldchild``'s parent will be set to None. :type oldchild: CanvasWidget :param oldchild: The child canvas widget to remove. :type newchild: CanvasWidget :param newchild: The canvas widget that should replace ``oldchild``. """ index = self._children.index(oldchild) self._children[index] = newchild self._remove_child_widget(oldchild) self._add_child_widget(newchild) self.update(newchild) def remove_child(self, child): """ Remove the given child canvas widget. ``child``'s parent will be set ot None. :type child: CanvasWidget :param child: The child canvas widget to remove. """ index = self._children.index(child) del self._children[index] self._remove_child_widget(child) if len(self._children) > 0: self.update(self._children[0]) def insert_child(self, index, child): """ Insert a child canvas widget before a given index. :type child: CanvasWidget :param child: The canvas widget that should be inserted. :type index: int :param index: The index where the child widget should be inserted. In particular, the index of ``child`` will be ``index``; and the index of any children whose indices were greater than equal to ``index`` before ``child`` was inserted will be incremented by one. """ self._children.insert(index, child) self._add_child_widget(child) class SpaceWidget(CanvasWidget): """ A canvas widget that takes up space but does not display anything. A ``SpaceWidget`` can be used to add space between elements. Each space widget is characterized by a width and a height. If you wish to only create horizontal space, then use a height of zero; and if you wish to only create vertical space, use a width of zero. """ def __init__(self, canvas, width, height, **attribs): """ Create a new space widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :type width: int :param width: The width of the new space widget. :type height: int :param height: The height of the new space widget. :param attribs: The new canvas widget's attributes. """ # For some reason, if width > 4: width -= 4 if height > 4: height -= 4 self._tag = canvas.create_line(1, 1, width, height, fill='') CanvasWidget.__init__(self, canvas, **attribs) # note: width() and height() are already defined by CanvasWidget. def set_width(self, width): """ Change the width of this space widget. :param width: The new width. :type width: int :rtype: None """ [x1, y1, x2, y2] = self.bbox() self.canvas().coords(self._tag, x1, y1, x1+width, y2) def set_height(self, height): """ Change the height of this space widget. :param height: The new height. :type height: int :rtype: None """ [x1, y1, x2, y2] = self.bbox() self.canvas().coords(self._tag, x1, y1, x2, y1+height) def _tags(self): return [self._tag] def __repr__(self): return '[Space]' class ScrollWatcherWidget(CanvasWidget): """ A special canvas widget that adjusts its ``Canvas``'s scrollregion to always include the bounding boxes of all of its children. The scroll-watcher widget will only increase the size of the ``Canvas``'s scrollregion; it will never decrease it. """ def __init__(self, canvas, *children, **attribs): """ Create a new scroll-watcher widget. :type canvas: Tkinter.Canvas :param canvas: This canvas widget's canvas. :type children: list(CanvasWidget) :param children: The canvas widgets watched by the scroll-watcher. The scroll-watcher will ensure that these canvas widgets are always contained in their canvas's scrollregion. :param attribs: The new canvas widget's attributes. 
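
        For example (a sketch, not from the original documentation; it
        assumes a Tk display is available), a canvas with an explicit
        scrollregion can be watched as follows:

            >>> from tkinter import Tk, Canvas
            >>> from nltk.draw.util import TextWidget, ScrollWatcherWidget
            >>> c = Canvas(Tk(), scrollregion='0 0 200 200')
            >>> watcher = ScrollWatcherWidget(c)
            >>> watcher.add_child(TextWidget(c, 'keep me in view'))
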
""" for child in children: self._add_child_widget(child) CanvasWidget.__init__(self, canvas, **attribs) def add_child(self, canvaswidget): """ Add a new canvas widget to the scroll-watcher. The scroll-watcher will ensure that the new canvas widget is always contained in its canvas's scrollregion. :param canvaswidget: The new canvas widget. :type canvaswidget: CanvasWidget :rtype: None """ self._add_child_widget(canvaswidget) self.update(canvaswidget) def remove_child(self, canvaswidget): """ Remove a canvas widget from the scroll-watcher. The scroll-watcher will no longer ensure that the new canvas widget is always contained in its canvas's scrollregion. :param canvaswidget: The canvas widget to remove. :type canvaswidget: CanvasWidget :rtype: None """ self._remove_child_widget(canvaswidget) def _tags(self): return [] def _update(self, child): self._adjust_scrollregion() def _adjust_scrollregion(self): """ Adjust the scrollregion of this scroll-watcher's ``Canvas`` to include the bounding boxes of all of its children. """ bbox = self.bbox() canvas = self.canvas() scrollregion = [int(n) for n in canvas['scrollregion'].split()] if len(scrollregion) != 4: return if (bbox[0] < scrollregion[0] or bbox[1] < scrollregion[1] or bbox[2] > scrollregion[2] or bbox[3] > scrollregion[3]): scrollregion = ('%d %d %d %d' % (min(bbox[0], scrollregion[0]), min(bbox[1], scrollregion[1]), max(bbox[2], scrollregion[2]), max(bbox[3], scrollregion[3]))) canvas['scrollregion'] = scrollregion ##////////////////////////////////////////////////////// ## Canvas Frame ##////////////////////////////////////////////////////// class CanvasFrame(object): """ A ``Tkinter`` frame containing a canvas and scrollbars. ``CanvasFrame`` uses a ``ScrollWatcherWidget`` to ensure that all of the canvas widgets contained on its canvas are within its scrollregion. In order for ``CanvasFrame`` to make these checks, all canvas widgets must be registered with ``add_widget`` when they are added to the canvas; and destroyed with ``destroy_widget`` when they are no longer needed. If a ``CanvasFrame`` is created with no parent, then it will create its own main window, including a "Done" button and a "Print" button. """ def __init__(self, parent=None, **kw): """ Create a new ``CanvasFrame``. :type parent: Tkinter.BaseWidget or Tkinter.Tk :param parent: The parent ``Tkinter`` widget. If no parent is specified, then ``CanvasFrame`` will create a new main window. :param kw: Keyword arguments for the new ``Canvas``. See the documentation for ``Tkinter.Canvas`` for more information. """ # If no parent was given, set up a top-level window. if parent is None: self._parent = Tk() self._parent.title('NLTK') self._parent.bind('', lambda e: self.print_to_file()) self._parent.bind('', self.destroy) self._parent.bind('', self.destroy) else: self._parent = parent # Create a frame for the canvas & scrollbars self._frame = frame = Frame(self._parent) self._canvas = canvas = Canvas(frame, **kw) xscrollbar = Scrollbar(self._frame, orient='horizontal') yscrollbar = Scrollbar(self._frame, orient='vertical') xscrollbar['command'] = canvas.xview yscrollbar['command'] = canvas.yview canvas['xscrollcommand'] = xscrollbar.set canvas['yscrollcommand'] = yscrollbar.set yscrollbar.pack(fill='y', side='right') xscrollbar.pack(fill='x', side='bottom') canvas.pack(expand=1, fill='both', side='left') # Set initial scroll region. 
scrollregion = '0 0 %s %s' % (canvas['width'], canvas['height']) canvas['scrollregion'] = scrollregion self._scrollwatcher = ScrollWatcherWidget(canvas) # If no parent was given, pack the frame, and add a menu. if parent is None: self.pack(expand=1, fill='both') self._init_menubar() def _init_menubar(self): menubar = Menu(self._parent) filemenu = Menu(menubar, tearoff=0) filemenu.add_command(label='Print to Postscript', underline=0, command=self.print_to_file, accelerator='Ctrl-p') filemenu.add_command(label='Exit', underline=1, command=self.destroy, accelerator='Ctrl-x') menubar.add_cascade(label='File', underline=0, menu=filemenu) self._parent.config(menu=menubar) def print_to_file(self, filename=None): """ Print the contents of this ``CanvasFrame`` to a postscript file. If no filename is given, then prompt the user for one. :param filename: The name of the file to print the tree to. :type filename: str :rtype: None """ if filename is None: from tkinter.filedialog import asksaveasfilename ftypes = [('Postscript files', '.ps'), ('All files', '*')] filename = asksaveasfilename(filetypes=ftypes, defaultextension='.ps') if not filename: return (x0, y0, w, h) = self.scrollregion() self._canvas.postscript(file=filename, x=x0, y=y0, width=w+2, height=h+2, pagewidth=w+2, # points = 1/72 inch pageheight=h+2, # points = 1/72 inch pagex=0, pagey=0) def scrollregion(self): """ :return: The current scroll region for the canvas managed by this ``CanvasFrame``. :rtype: 4-tuple of int """ (x1, y1, x2, y2) = self._canvas['scrollregion'].split() return (int(x1), int(y1), int(x2), int(y2)) def canvas(self): """ :return: The canvas managed by this ``CanvasFrame``. :rtype: Tkinter.Canvas """ return self._canvas def add_widget(self, canvaswidget, x=None, y=None): """ Register a canvas widget with this ``CanvasFrame``. The ``CanvasFrame`` will ensure that this canvas widget is always within the ``Canvas``'s scrollregion. If no coordinates are given for the canvas widget, then the ``CanvasFrame`` will attempt to find a clear area of the canvas for it. :type canvaswidget: CanvasWidget :param canvaswidget: The new canvas widget. ``canvaswidget`` must have been created on this ``CanvasFrame``'s canvas. :type x: int :param x: The initial x coordinate for the upper left hand corner of ``canvaswidget``, in the canvas's coordinate space. :type y: int :param y: The initial y coordinate for the upper left hand corner of ``canvaswidget``, in the canvas's coordinate space. """ if x is None or y is None: (x, y) = self._find_room(canvaswidget, x, y) # Move to (x,y) (x1,y1,x2,y2) = canvaswidget.bbox() canvaswidget.move(x-x1,y-y1) # Register with scrollwatcher. self._scrollwatcher.add_child(canvaswidget) def _find_room(self, widget, desired_x, desired_y): """ Try to find a space for a given widget. """ (left, top, right, bot) = self.scrollregion() w = widget.width() h = widget.height() if w >= (right-left): return (0,0) if h >= (bot-top): return (0,0) # Move the widget out of the way, for now. 
(x1,y1,x2,y2) = widget.bbox() widget.move(left-x2-50, top-y2-50) if desired_x is not None: x = desired_x for y in range(top, bot-h, int((bot-top-h)/10)): if not self._canvas.find_overlapping(x-5, y-5, x+w+5, y+h+5): return (x,y) if desired_y is not None: y = desired_y for x in range(left, right-w, int((right-left-w)/10)): if not self._canvas.find_overlapping(x-5, y-5, x+w+5, y+h+5): return (x,y) for y in range(top, bot-h, int((bot-top-h)/10)): for x in range(left, right-w, int((right-left-w)/10)): if not self._canvas.find_overlapping(x-5, y-5, x+w+5, y+h+5): return (x,y) return (0,0) def destroy_widget(self, canvaswidget): """ Remove a canvas widget from this ``CanvasFrame``. This deregisters the canvas widget, and destroys it. """ self.remove_widget(canvaswidget) canvaswidget.destroy() def remove_widget(self, canvaswidget): # Deregister with scrollwatcher. self._scrollwatcher.remove_child(canvaswidget) def pack(self, cnf={}, **kw): """ Pack this ``CanvasFrame``. See the documentation for ``Tkinter.Pack`` for more information. """ self._frame.pack(cnf, **kw) # Adjust to be big enough for kids? def destroy(self, *e): """ Destroy this ``CanvasFrame``. If this ``CanvasFrame`` created a top-level window, then this will close that window. """ if self._parent is None: return self._parent.destroy() self._parent = None def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this frame is created from a non-interactive program (e.g. from a secript); otherwise, the frame will close as soon as the script completes. """ if in_idle(): return self._parent.mainloop(*args, **kwargs) ##////////////////////////////////////////////////////// ## Text display ##////////////////////////////////////////////////////// class ShowText(object): """ A ``Tkinter`` window used to display a text. ``ShowText`` is typically used by graphical tools to display help text, or similar information. """ def __init__(self, root, title, text, width=None, height=None, **textbox_options): if width is None or height is None: (width, height) = self.find_dimentions(text, width, height) # Create the main window. if root is None: self._top = top = Tk() else: self._top = top = Toplevel(root) top.title(title) b = Button(top, text='Ok', command=self.destroy) b.pack(side='bottom') tbf = Frame(top) tbf.pack(expand=1, fill='both') scrollbar = Scrollbar(tbf, orient='vertical') scrollbar.pack(side='right', fill='y') textbox = Text(tbf, wrap='word', width=width, height=height, **textbox_options) textbox.insert('end', text) textbox['state'] = 'disabled' textbox.pack(side='left', expand=1, fill='both') scrollbar['command'] = textbox.yview textbox['yscrollcommand'] = scrollbar.set # Make it easy to close the window. top.bind('q', self.destroy) top.bind('x', self.destroy) top.bind('c', self.destroy) top.bind('', self.destroy) top.bind('', self.destroy) # Focus the scrollbar, so they can use up/down, etc. scrollbar.focus() def find_dimentions(self, text, width, height): lines = text.split('\n') if width is None: maxwidth = max(len(line) for line in lines) width = min(maxwidth, 80) # Now, find height. height = 0 for line in lines: while len(line) > width: brk = line[:width].rfind(' ') line = line[brk:] height += 1 height += 1 height = min(height, 25) return (width, height) def destroy(self, *e): if self._top is None: return self._top.destroy() self._top = None def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. 
This function must be called if this window is created from a non-interactive program (e.g. from a secript); otherwise, the window will close as soon as the script completes. """ if in_idle(): return self._top.mainloop(*args, **kwargs) ##////////////////////////////////////////////////////// ## Entry dialog ##////////////////////////////////////////////////////// class EntryDialog(object): """ A dialog box for entering """ def __init__(self, parent, original_text='', instructions='', set_callback=None, title=None): self._parent = parent self._original_text = original_text self._set_callback = set_callback width = int(max(30, len(original_text)*3/2)) self._top = Toplevel(parent) if title: self._top.title(title) # The text entry box. entryframe = Frame(self._top) entryframe.pack(expand=1, fill='both', padx=5, pady=5,ipady=10) if instructions: l=Label(entryframe, text=instructions) l.pack(side='top', anchor='w', padx=30) self._entry = Entry(entryframe, width=width) self._entry.pack(expand=1, fill='x', padx=30) self._entry.insert(0, original_text) # A divider divider = Frame(self._top, borderwidth=1, relief='sunken') divider.pack(fill='x', ipady=1, padx=10) # The buttons. buttons = Frame(self._top) buttons.pack(expand=0, fill='x', padx=5, pady=5) b = Button(buttons, text='Cancel', command=self._cancel, width=8) b.pack(side='right', padx=5) b = Button(buttons, text='Ok', command=self._ok, width=8, default='active') b.pack(side='left', padx=5) b = Button(buttons, text='Apply', command=self._apply, width=8) b.pack(side='left') self._top.bind('', self._ok) self._top.bind('', self._cancel) self._top.bind('', self._cancel) self._entry.focus() def _reset(self, *e): self._entry.delete(0,'end') self._entry.insert(0, self._original_text) if self._set_callback: self._set_callback(self._original_text) def _cancel(self, *e): try: self._reset() except: pass self._destroy() def _ok(self, *e): self._apply() self._destroy() def _apply(self, *e): if self._set_callback: self._set_callback(self._entry.get()) def _destroy(self, *e): if self._top is None: return self._top.destroy() self._top = None ##////////////////////////////////////////////////////// ## Colorized List ##////////////////////////////////////////////////////// class ColorizedList(object): """ An abstract base class for displaying a colorized list of items. Subclasses should define: - ``_init_colortags``, which sets up Text color tags that will be used by the list. - ``_item_repr``, which returns a list of (text,colortag) tuples that make up the colorized representation of the item. :note: Typically, you will want to register a callback for ``'select'`` that calls ``mark`` on the given item. """ def __init__(self, parent, items=[], **options): """ Construct a new list. :param parent: The Tk widget that contains the colorized list :param items: The initial contents of the colorized list. :param options: """ self._parent = parent self._callbacks = {} # Which items are marked? self._marks = {} # Initialize the Tkinter frames. self._init_itemframe(options.copy()) # Set up key & mouse bindings. self._textwidget.bind('', self._keypress) self._textwidget.bind('', self._buttonpress) # Fill in the given CFG's items. self._items = None self.set(items) #//////////////////////////////////////////////////////////// # Abstract methods #//////////////////////////////////////////////////////////// def _init_colortags(self, textwidget, options): """ Set up any colortags that will be used by this colorized list. 
E.g.: >>> textwidget.tag_config('terminal', foreground='black') """ raise NotImplementedError() def _item_repr(self, item): """ Return a list of (text, colortag) tuples that make up the colorized representation of the item. Colorized representations may not span multiple lines. I.e., the text strings returned may not contain newline characters. """ raise NotImplementedError() #//////////////////////////////////////////////////////////// # Item Access #//////////////////////////////////////////////////////////// def get(self, index=None): """ :return: A list of the items contained by this list. """ if index is None: return self._items[:] else: return self._items[index] def set(self, items): """ Modify the list of items contained by this list. """ items = list(items) if self._items == items: return self._items = list(items) self._textwidget['state'] = 'normal' self._textwidget.delete('1.0', 'end') for item in items: for (text, colortag) in self._item_repr(item): assert '\n' not in text, 'item repr may not contain newline' self._textwidget.insert('end', text, colortag) self._textwidget.insert('end', '\n') # Remove the final newline self._textwidget.delete('end-1char', 'end') self._textwidget.mark_set('insert', '1.0') self._textwidget['state'] = 'disabled' # Clear all marks self._marks.clear() def unmark(self, item=None): """ Remove highlighting from the given item; or from every item, if no item is given. :raise ValueError: If ``item`` is not contained in the list. :raise KeyError: If ``item`` is not marked. """ if item is None: self._marks.clear() self._textwidget.tag_remove('highlight', '1.0', 'end+1char') else: index = self._items.index(item) del self._marks[item] (start, end) = ('%d.0' % (index+1), '%d.0' % (index+2)) self._textwidget.tag_remove('highlight', start, end) def mark(self, item): """ Highlight the given item. :raise ValueError: If ``item`` is not contained in the list. """ self._marks[item] = 1 index = self._items.index(item) (start, end) = ('%d.0' % (index+1), '%d.0' % (index+2)) self._textwidget.tag_add('highlight', start, end) def markonly(self, item): """ Remove any current highlighting, and mark the given item. :raise ValueError: If ``item`` is not contained in the list. """ self.unmark() self.mark(item) def view(self, item): """ Adjust the view such that the given item is visible. If the item is already visible, then do nothing. """ index = self._items.index(item) self._textwidget.see('%d.0' % (index+1)) #//////////////////////////////////////////////////////////// # Callbacks #//////////////////////////////////////////////////////////// def add_callback(self, event, func): """ Register a callback function with the list. This function will be called whenever the given event occurs. :param event: The event that will trigger the callback function. Valid events are: click1, click2, click3, space, return, select, up, down, next, prior, move :param func: The function that should be called when the event occurs. ``func`` will be called with a single item as its argument. (The item selected or the item moved to). """ if event == 'select': events = ['click1', 'space', 'return'] elif event == 'move': events = ['up', 'down', 'next', 'prior'] else: events = [event] for e in events: self._callbacks.setdefault(e,{})[func] = 1 def remove_callback(self, event, func=None): """ Deregister a callback function. If ``func`` is none, then all callbacks are removed for the given event. 
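
        As a sketch of how the callback interface fits together (the
        ``StringList`` subclass below is hypothetical, not part of this
        module; a Tk display is assumed):

            >>> from tkinter import Tk
            >>> from nltk.draw.util import ColorizedList
            >>> class StringList(ColorizedList):
            ...     def _init_colortags(self, textwidget, options):
            ...         textwidget.tag_config('item', foreground='#004070')
            ...     def _item_repr(self, item):
            ...         return [(str(item), 'item')]
            >>> lst = StringList(Tk(), ['alpha', 'beta', 'gamma'])
            >>> lst.pack()
            >>> lst.add_callback('select', lambda item: lst.markonly(item))
            >>> lst.remove_callback('select')
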
""" if event is None: events = list(self._callbacks.keys()) elif event == 'select': events = ['click1', 'space', 'return'] elif event == 'move': events = ['up', 'down', 'next', 'prior'] else: events = [event] for e in events: if func is None: del self._callbacks[e] else: try: del self._callbacks[e][func] except: pass #//////////////////////////////////////////////////////////// # Tkinter Methods #//////////////////////////////////////////////////////////// def pack(self, cnf={}, **kw): # "@include: Tkinter.Pack.pack" self._itemframe.pack(cnf, **kw) def grid(self, cnf={}, **kw): # "@include: Tkinter.Grid.grid" self._itemframe.grid(cnf, *kw) def focus(self): # "@include: Tkinter.Widget.focus" self._textwidget.focus() #//////////////////////////////////////////////////////////// # Internal Methods #//////////////////////////////////////////////////////////// def _init_itemframe(self, options): self._itemframe = Frame(self._parent) # Create the basic Text widget & scrollbar. options.setdefault('background', '#e0e0e0') self._textwidget = Text(self._itemframe, **options) self._textscroll = Scrollbar(self._itemframe, takefocus=0, orient='vertical') self._textwidget.config(yscrollcommand = self._textscroll.set) self._textscroll.config(command=self._textwidget.yview) self._textscroll.pack(side='right', fill='y') self._textwidget.pack(expand=1, fill='both', side='left') # Initialize the colorization tags self._textwidget.tag_config('highlight', background='#e0ffff', border='1', relief='raised') self._init_colortags(self._textwidget, options) # How do I want to mark keyboard selection? self._textwidget.tag_config('sel', foreground='') self._textwidget.tag_config('sel', foreground='', background='', border='', underline=1) self._textwidget.tag_lower('highlight', 'sel') def _fire_callback(self, event, itemnum): if event not in self._callbacks: return if 0 <= itemnum < len(self._items): item = self._items[itemnum] else: item = None for cb_func in list(self._callbacks[event].keys()): cb_func(item) def _buttonpress(self, event): clickloc = '@%d,%d' % (event.x,event.y) insert_point = self._textwidget.index(clickloc) itemnum = int(insert_point.split('.')[0])-1 self._fire_callback('click%d' % event.num, itemnum) def _keypress(self, event): if event.keysym == 'Return' or event.keysym == 'space': insert_point = self._textwidget.index('insert') itemnum = int(insert_point.split('.')[0])-1 self._fire_callback(event.keysym.lower(), itemnum) return elif event.keysym == 'Down': delta='+1line' elif event.keysym == 'Up': delta='-1line' elif event.keysym == 'Next': delta='+10lines' elif event.keysym == 'Prior': delta='-10lines' else: return 'continue' self._textwidget.mark_set('insert', 'insert'+delta) self._textwidget.see('insert') self._textwidget.tag_remove('sel', '1.0', 'end+1char') self._textwidget.tag_add('sel', 'insert linestart', 'insert lineend') insert_point = self._textwidget.index('insert') itemnum = int(insert_point.split('.')[0])-1 self._fire_callback(event.keysym.lower(), itemnum) return 'break' ##////////////////////////////////////////////////////// ## Improved OptionMenu ##////////////////////////////////////////////////////// class MutableOptionMenu(Menubutton): def __init__(self, master, values, **options): self._callback = options.get('command') if 'command' in options: del options['command'] # Create a variable self._variable = variable = StringVar() if len(values) > 0: variable.set(values[0]) kw = {"borderwidth": 2, "textvariable": variable, "indicatoron": 1, "relief": RAISED, "anchor": "c", 
"highlightthickness": 2} kw.update(options) Widget.__init__(self, master, "menubutton", kw) self.widgetName = 'tk_optionMenu' self._menu = Menu(self, name="menu", tearoff=0,) self.menuname = self._menu._w self._values = [] for value in values: self.add(value) self["menu"] = self._menu def add(self, value): if value in self._values: return def set(value=value): self.set(value) self._menu.add_command(label=value, command=set) self._values.append(value) def set(self, value): self._variable.set(value) if self._callback: self._callback(value) def remove(self, value): # Might raise indexerror: pass to parent. i = self._values.index(value) del self._values[i] self._menu.delete(i, i) def __getitem__(self, name): if name == 'menu': return self.__menu return Widget.__getitem__(self, name) def destroy(self): """Destroy this widget and the associated menu.""" Menubutton.destroy(self) self._menu = None ##////////////////////////////////////////////////////// ## Test code. ##////////////////////////////////////////////////////// def demo(): """ A simple demonstration showing how to use canvas widgets. """ def fill(cw): from random import randint cw['fill'] = '#00%04d' % randint(0,9999) def color(cw): from random import randint cw['color'] = '#ff%04d' % randint(0,9999) cf = CanvasFrame(closeenough=10, width=300, height=300) c = cf.canvas() ct3 = TextWidget(c, 'hiya there', draggable=1) ct2 = TextWidget(c, 'o o\n||\n___\n U', draggable=1, justify='center') co = OvalWidget(c, ct2, outline='red') ct = TextWidget(c, 'o o\n||\n\\___/', draggable=1, justify='center') cp = ParenWidget(c, ct, color='red') cb = BoxWidget(c, cp, fill='cyan', draggable=1, width=3, margin=10) equation = SequenceWidget(c, SymbolWidget(c, 'forall'), TextWidget(c, 'x'), SymbolWidget(c, 'exists'), TextWidget(c, 'y: '), TextWidget(c, 'x'), SymbolWidget(c, 'notequal'), TextWidget(c, 'y')) space = SpaceWidget(c, 0, 30) cstack = StackWidget(c, cb, ct3, space, co, equation, align='center') foo = TextWidget(c, 'try clicking\nand dragging', draggable=1, justify='center') cs = SequenceWidget(c, cstack, foo) zz = BracketWidget(c, cs, color='green4', width=3) cf.add_widget(zz, 60, 30) cb.bind_click(fill) ct.bind_click(color) co.bind_click(fill) ct2.bind_click(color) ct3.bind_click(color) cf.mainloop() #ShowText(None, 'title', ((('this is text'*150)+'\n')*5)) if __name__ == '__main__': demo() nltk-3.1/nltk/featstruct.py0000644000076500000240000030744612607224144015602 0ustar sbstaff00000000000000# Natural Language Toolkit: Feature Structures # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper , # Rob Speer, # Steven Bird # URL: # For license information, see LICENSE.TXT """ Basic data classes for representing feature structures, and for performing basic operations on those feature structures. A feature structure is a mapping from feature identifiers to feature values, where each feature value is either a basic value (such as a string or an integer), or a nested feature structure. There are two types of feature structure, implemented by two subclasses of ``FeatStruct``: - feature dictionaries, implemented by ``FeatDict``, act like Python dictionaries. Feature identifiers may be strings or instances of the ``Feature`` class. - feature lists, implemented by ``FeatList``, act like Python lists. Feature identifiers are integers. Feature structures are typically used to represent partial information about objects. A feature identifier that is not mapped to a value stands for a feature whose value is unknown (*not* a feature without a value). 
Two feature structures that represent (potentially overlapping) information about the same object can be combined by unification. When two inconsistent feature structures are unified, the unification fails and returns None. Features can be specified using "feature paths", or tuples of feature identifiers that specify path through the nested feature structures to a value. Feature structures may contain reentrant feature values. A "reentrant feature value" is a single feature value that can be accessed via multiple feature paths. Unification preserves the reentrance relations imposed by both of the unified feature structures. In the feature structure resulting from unification, any modifications to a reentrant feature value will be visible using any of its feature paths. Feature structure variables are encoded using the ``nltk.sem.Variable`` class. The variables' values are tracked using a bindings dictionary, which maps variables to their values. When two feature structures are unified, a fresh bindings dictionary is created to track their values; and before unification completes, all bound variables are replaced by their values. Thus, the bindings dictionaries are usually strictly internal to the unification process. However, it is possible to track the bindings of variables if you choose to, by supplying your own initial bindings dictionary to the ``unify()`` function. When unbound variables are unified with one another, they become aliased. This is encoded by binding one variable to the other. Lightweight Feature Structures ============================== Many of the functions defined by ``nltk.featstruct`` can be applied directly to simple Python dictionaries and lists, rather than to full-fledged ``FeatDict`` and ``FeatList`` objects. In other words, Python ``dicts`` and ``lists`` can be used as "light-weight" feature structures. >>> from nltk.featstruct import unify >>> unify(dict(x=1, y=dict()), dict(a='a', y=dict(b='b'))) # doctest: +SKIP {'y': {'b': 'b'}, 'x': 1, 'a': 'a'} However, you should keep in mind the following caveats: - Python dictionaries & lists ignore reentrance when checking for equality between values. But two FeatStructs with different reentrances are considered nonequal, even if all their base values are equal. - FeatStructs can be easily frozen, allowing them to be used as keys in hash tables. Python dictionaries and lists can not. - FeatStructs display reentrance in their string representations; Python dictionaries and lists do not. - FeatStructs may *not* be mixed with Python dictionaries and lists (e.g., when performing unification). - FeatStructs provide a number of useful methods, such as ``walk()`` and ``cyclic()``, which are not available for Python dicts and lists. In general, if your feature structures will contain any reentrances, or if you plan to use them as dictionary keys, it is strongly recommended that you use full-fledged ``FeatStruct`` objects. 
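For example, a full ``FeatStruct`` makes shared (reentrant) values
explicit in its repr, printing the shared value once and referring back
to it by a numbered tag (illustrative doctest; the tag numbering shown
assumes the default display):

>>> from nltk.featstruct import FeatStruct
>>> inner = FeatStruct(number='sing')
>>> FeatStruct(subj=inner, obj=inner)
[obj=(1)[number='sing'], subj->(1)]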
""" from __future__ import print_function, unicode_literals, division import re import copy from nltk.internals import read_str, raise_unorderable_types from nltk.sem.logic import (Variable, Expression, SubstituteBindingsI, LogicParser, LogicalExpressionException) from nltk.compat import (string_types, integer_types, total_ordering, python_2_unicode_compatible, unicode_repr) ###################################################################### # Feature Structure ###################################################################### @total_ordering class FeatStruct(SubstituteBindingsI): """ A mapping from feature identifiers to feature values, where each feature value is either a basic value (such as a string or an integer), or a nested feature structure. There are two types of feature structure: - feature dictionaries, implemented by ``FeatDict``, act like Python dictionaries. Feature identifiers may be strings or instances of the ``Feature`` class. - feature lists, implemented by ``FeatList``, act like Python lists. Feature identifiers are integers. Feature structures may be indexed using either simple feature identifiers or 'feature paths.' A feature path is a sequence of feature identifiers that stand for a corresponding sequence of indexing operations. In particular, ``fstruct[(f1,f2,...,fn)]`` is equivalent to ``fstruct[f1][f2]...[fn]``. Feature structures may contain reentrant feature structures. A "reentrant feature structure" is a single feature structure object that can be accessed via multiple feature paths. Feature structures may also be cyclic. A feature structure is "cyclic" if there is any feature path from the feature structure to itself. Two feature structures are considered equal if they assign the same values to all features, and have the same reentrancies. By default, feature structures are mutable. They may be made immutable with the ``freeze()`` method. Once they have been frozen, they may be hashed, and thus used as dictionary keys. """ _frozen = False """:ivar: A flag indicating whether this feature structure is frozen or not. Once this flag is set, it should never be un-set; and no further modification should be made to this feature structue.""" ##//////////////////////////////////////////////////////////// #{ Constructor ##//////////////////////////////////////////////////////////// def __new__(cls, features=None, **morefeatures): """ Construct and return a new feature structure. If this constructor is called directly, then the returned feature structure will be an instance of either the ``FeatDict`` class or the ``FeatList`` class. :param features: The initial feature values for this feature structure: - FeatStruct(string) -> FeatStructReader().read(string) - FeatStruct(mapping) -> FeatDict(mapping) - FeatStruct(sequence) -> FeatList(sequence) - FeatStruct() -> FeatDict() :param morefeatures: If ``features`` is a mapping or None, then ``morefeatures`` provides additional features for the ``FeatDict`` constructor. """ # If the FeatStruct constructor is called directly, then decide # whether to create a FeatDict or a FeatList, based on the # contents of the `features` argument. 
if cls is FeatStruct: if features is None: return FeatDict.__new__(FeatDict, **morefeatures) elif _is_mapping(features): return FeatDict.__new__(FeatDict, features, **morefeatures) elif morefeatures: raise TypeError('Keyword arguments may only be specified ' 'if features is None or is a mapping.') if isinstance(features, string_types): if FeatStructReader._START_FDICT_RE.match(features): return FeatDict.__new__(FeatDict, features, **morefeatures) else: return FeatList.__new__(FeatList, features, **morefeatures) elif _is_sequence(features): return FeatList.__new__(FeatList, features) else: raise TypeError('Expected string or mapping or sequence') # Otherwise, construct the object as normal. else: return super(FeatStruct, cls).__new__(cls, features, **morefeatures) ##//////////////////////////////////////////////////////////// #{ Uniform Accessor Methods ##//////////////////////////////////////////////////////////// # These helper functions allow the methods defined by FeatStruct # to treat all feature structures as mappings, even if they're # really lists. (Lists are treated as mappings from ints to vals) def _keys(self): """Return an iterable of the feature identifiers used by this FeatStruct.""" raise NotImplementedError() # Implemented by subclasses. def _values(self): """Return an iterable of the feature values directly defined by this FeatStruct.""" raise NotImplementedError() # Implemented by subclasses. def _items(self): """Return an iterable of (fid,fval) pairs, where fid is a feature identifier and fval is the corresponding feature value, for all features defined by this FeatStruct.""" raise NotImplementedError() # Implemented by subclasses. ##//////////////////////////////////////////////////////////// #{ Equality & Hashing ##//////////////////////////////////////////////////////////// def equal_values(self, other, check_reentrance=False): """ Return True if ``self`` and ``other`` assign the same value to to every feature. In particular, return true if ``self[p]==other[p]`` for every feature path *p* such that ``self[p]`` or ``other[p]`` is a base value (i.e., not a nested feature structure). :param check_reentrance: If True, then also return False if there is any difference between the reentrances of ``self`` and ``other``. :note: the ``==`` is equivalent to ``equal_values()`` with ``check_reentrance=True``. """ return self._equal(other, check_reentrance, set(), set(), set()) def __eq__(self, other): """ Return true if ``self`` and ``other`` are both feature structures, assign the same values to all features, and contain the same reentrances. I.e., return ``self.equal_values(other, check_reentrance=True)``. :see: ``equal_values()`` """ return self._equal(other, True, set(), set(), set()) def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, FeatStruct): # raise_unorderable_types("<", self, other) # Sometimes feature values can be pure strings, # so we need to be able to compare with non-featstructs: return self.__class__.__name__ < other.__class__.__name__ else: return len(self) < len(other) def __hash__(self): """ If this feature structure is frozen, return its hash value; otherwise, raise ``TypeError``. 
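A brief illustration (doctest sketch; the error message is the one
raised by the implementation below):

>>> from nltk.featstruct import FeatStruct
>>> fs = FeatStruct(number='singular')
>>> hash(fs)
Traceback (most recent call last):
  ...
TypeError: FeatStructs must be frozen before they can be hashed.
>>> fs.freeze()
>>> isinstance(hash(fs), int)
True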
""" if not self._frozen: raise TypeError('FeatStructs must be frozen before they ' 'can be hashed.') try: return self._hash except AttributeError: self._hash = self._calculate_hashvalue(set()) return self._hash def _equal(self, other, check_reentrance, visited_self, visited_other, visited_pairs): """ Return True iff self and other have equal values. :param visited_self: A set containing the ids of all ``self`` feature structures we've already visited. :param visited_other: A set containing the ids of all ``other`` feature structures we've already visited. :param visited_pairs: A set containing ``(selfid, otherid)`` pairs for all pairs of feature structures we've already visited. """ # If we're the same object, then we're equal. if self is other: return True # If we have different classes, we're definitely not equal. if self.__class__ != other.__class__: return False # If we define different features, we're definitely not equal. # (Perform len test first because it's faster -- we should # do profiling to see if this actually helps) if len(self) != len(other): return False if set(self._keys()) != set(other._keys()): return False # If we're checking reentrance, then any time we revisit a # structure, make sure that it was paired with the same # feature structure that it is now. Note: if check_reentrance, # then visited_pairs will never contain two pairs whose first # values are equal, or two pairs whose second values are equal. if check_reentrance: if id(self) in visited_self or id(other) in visited_other: return (id(self), id(other)) in visited_pairs # If we're not checking reentrance, then we still need to deal # with cycles. If we encounter the same (self, other) pair a # second time, then we won't learn anything more by examining # their children a second time, so just return true. else: if (id(self), id(other)) in visited_pairs: return True # Keep track of which nodes we've visited. visited_self.add(id(self)) visited_other.add(id(other)) visited_pairs.add( (id(self), id(other)) ) # Now we have to check all values. If any of them don't match, # then return false. for (fname, self_fval) in self._items(): other_fval = other[fname] if isinstance(self_fval, FeatStruct): if not self_fval._equal(other_fval, check_reentrance, visited_self, visited_other, visited_pairs): return False else: if self_fval != other_fval: return False # Everything matched up; return true. return True def _calculate_hashvalue(self, visited): """ Return a hash value for this feature structure. :require: ``self`` must be frozen. :param visited: A set containing the ids of all feature structures we've already visited while hashing. """ if id(self) in visited: return 1 visited.add(id(self)) hashval = 5831 for (fname, fval) in sorted(self._items()): hashval *= 37 hashval += hash(fname) hashval *= 37 if isinstance(fval, FeatStruct): hashval += fval._calculate_hashvalue(visited) else: hashval += hash(fval) # Convert to a 32 bit int. hashval = int(hashval & 0x7fffffff) return hashval ##//////////////////////////////////////////////////////////// #{ Freezing ##//////////////////////////////////////////////////////////// #: Error message used by mutating methods when called on a frozen #: feature structure. _FROZEN_ERROR = "Frozen FeatStructs may not be modified." def freeze(self): """ Make this feature structure, and any feature structures it contains, immutable. Note: this method does not attempt to 'freeze' any feature value that is not a ``FeatStruct``; it is recommended that you use only immutable feature values. 
""" if self._frozen: return self._freeze(set()) def frozen(self): """ Return True if this feature structure is immutable. Feature structures can be made immutable with the ``freeze()`` method. Immutable feature structures may not be made mutable again, but new mutable copies can be produced with the ``copy()`` method. """ return self._frozen def _freeze(self, visited): """ Make this feature structure, and any feature structure it contains, immutable. :param visited: A set containing the ids of all feature structures we've already visited while freezing. """ if id(self) in visited: return visited.add(id(self)) self._frozen = True for (fname, fval) in sorted(self._items()): if isinstance(fval, FeatStruct): fval._freeze(visited) ##//////////////////////////////////////////////////////////// #{ Copying ##//////////////////////////////////////////////////////////// def copy(self, deep=True): """ Return a new copy of ``self``. The new copy will not be frozen. :param deep: If true, create a deep copy; if false, create a shallow copy. """ if deep: return copy.deepcopy(self) else: return self.__class__(self) # Subclasses should define __deepcopy__ to ensure that the new # copy will not be frozen. def __deepcopy__(self, memo): raise NotImplementedError() # Implemented by subclasses. ##//////////////////////////////////////////////////////////// #{ Structural Information ##//////////////////////////////////////////////////////////// def cyclic(self): """ Return True if this feature structure contains itself. """ return self._find_reentrances({})[id(self)] def walk(self): """ Return an iterator that generates this feature structure, and each feature structure it contains. Each feature structure will be generated exactly once. """ return self._walk(set()) def _walk(self, visited): """ Return an iterator that generates this feature structure, and each feature structure it contains. :param visited: A set containing the ids of all feature structures we've already visited while freezing. """ raise NotImplementedError() # Implemented by subclasses. def _walk(self, visited): if id(self) in visited: return visited.add(id(self)) yield self for fval in self._values(): if isinstance(fval, FeatStruct): for elt in fval._walk(visited): yield elt # Walk through the feature tree. The first time we see a feature # value, map it to False (not reentrant). If we see a feature # value more than once, then map it to True (reentrant). def _find_reentrances(self, reentrances): """ Return a dictionary that maps from the ``id`` of each feature structure contained in ``self`` (including ``self``) to a boolean value, indicating whether it is reentrant or not. """ if id(self) in reentrances: # We've seen it more than once. reentrances[id(self)] = True else: # This is the first time we've seen it. reentrances[id(self)] = False # Recurse to contained feature structures. 
for fval in self._values(): if isinstance(fval, FeatStruct): fval._find_reentrances(reentrances) return reentrances ##//////////////////////////////////////////////////////////// #{ Variables & Bindings ##//////////////////////////////////////////////////////////// def substitute_bindings(self, bindings): """:see: ``nltk.featstruct.substitute_bindings()``""" return substitute_bindings(self, bindings) def retract_bindings(self, bindings): """:see: ``nltk.featstruct.retract_bindings()``""" return retract_bindings(self, bindings) def variables(self): """:see: ``nltk.featstruct.find_variables()``""" return find_variables(self) def rename_variables(self, vars=None, used_vars=(), new_vars=None): """:see: ``nltk.featstruct.rename_variables()``""" return rename_variables(self, vars, used_vars, new_vars) def remove_variables(self): """ Return the feature structure that is obtained by deleting any feature whose value is a ``Variable``. :rtype: FeatStruct """ return remove_variables(self) ##//////////////////////////////////////////////////////////// #{ Unification ##//////////////////////////////////////////////////////////// def unify(self, other, bindings=None, trace=False, fail=None, rename_vars=True): return unify(self, other, bindings, trace, fail, rename_vars) def subsumes(self, other): """ Return True if ``self`` subsumes ``other``. I.e., return true If unifying ``self`` with ``other`` would result in a feature structure equal to ``other``. """ return subsumes(self, other) ##//////////////////////////////////////////////////////////// #{ String Representations ##//////////////////////////////////////////////////////////// def __repr__(self): """ Display a single-line representation of this feature structure, suitable for embedding in other representations. """ return self._repr(self._find_reentrances({}), {}) def _repr(self, reentrances, reentrance_ids): """ Return a string representation of this feature structure. :param reentrances: A dictionary that maps from the ``id`` of each feature value in self, indicating whether that value is reentrant or not. :param reentrance_ids: A dictionary mapping from each ``id`` of a feature value to a unique identifier. This is modified by ``repr``: the first time a reentrant feature value is displayed, an identifier is added to ``reentrance_ids`` for it. """ raise NotImplementedError() # Mutation: disable if frozen. _FROZEN_ERROR = "Frozen FeatStructs may not be modified." _FROZEN_NOTICE = "\n%sIf self is frozen, raise ValueError." def _check_frozen(method, indent=''): """ Given a method function, return a new method function that first checks if ``self._frozen`` is true; and if so, raises ``ValueError`` with an appropriate message. Otherwise, call the method and return its result. """ def wrapped(self, *args, **kwargs): if self._frozen: raise ValueError(_FROZEN_ERROR) else: return method(self, *args, **kwargs) wrapped.__name__ = method.__name__ wrapped.__doc__ = (method.__doc__ or '') + (_FROZEN_NOTICE % indent) return wrapped ###################################################################### # Feature Dictionary ###################################################################### @python_2_unicode_compatible class FeatDict(FeatStruct, dict): """ A feature structure that acts like a Python dictionary. I.e., a mapping from feature identifiers to feature values, where a feature identifier can be a string or a ``Feature``; and where a feature value can be either a basic value (such as a string or an integer), or a nested feature structure. 
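For example, a value can be retrieved either by nested indexing or by a
single feature path (illustrative doctest):

>>> from nltk.featstruct import FeatStruct
>>> fs = FeatStruct(subj=FeatStruct(agr=FeatStruct(number='sing')))
>>> fs['subj']['agr']['number']
'sing'
>>> fs[('subj', 'agr', 'number')]
'sing'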
A feature identifiers for a ``FeatDict`` is sometimes called a "feature name". Two feature dicts are considered equal if they assign the same values to all features, and have the same reentrances. :see: ``FeatStruct`` for information about feature paths, reentrance, cyclic feature structures, mutability, freezing, and hashing. """ def __init__(self, features=None, **morefeatures): """ Create a new feature dictionary, with the specified features. :param features: The initial value for this feature dictionary. If ``features`` is a ``FeatStruct``, then its features are copied (shallow copy). If ``features`` is a dict, then a feature is created for each item, mapping its key to its value. If ``features`` is a string, then it is processed using ``FeatStructReader``. If ``features`` is a list of tuples ``(name, val)``, then a feature is created for each tuple. :param morefeatures: Additional features for the new feature dictionary. If a feature is listed under both ``features`` and ``morefeatures``, then the value from ``morefeatures`` will be used. """ if isinstance(features, string_types): FeatStructReader().fromstring(features, self) self.update(**morefeatures) else: # update() checks the types of features. self.update(features, **morefeatures) #//////////////////////////////////////////////////////////// #{ Dict methods #//////////////////////////////////////////////////////////// _INDEX_ERROR = str("Expected feature name or path. Got %r.") def __getitem__(self, name_or_path): """If the feature with the given name or path exists, return its value; otherwise, raise ``KeyError``.""" if isinstance(name_or_path, (string_types, Feature)): return dict.__getitem__(self, name_or_path) elif isinstance(name_or_path, tuple): try: val = self for fid in name_or_path: if not isinstance(val, FeatStruct): raise KeyError # path contains base value val = val[fid] return val except (KeyError, IndexError): raise KeyError(name_or_path) else: raise TypeError(self._INDEX_ERROR % name_or_path) def get(self, name_or_path, default=None): """If the feature with the given name or path exists, return its value; otherwise, return ``default``.""" try: return self[name_or_path] except KeyError: return default def __contains__(self, name_or_path): """Return true if a feature with the given name or path exists.""" try: self[name_or_path]; return True except KeyError: return False def has_key(self, name_or_path): """Return true if a feature with the given name or path exists.""" return name_or_path in self def __delitem__(self, name_or_path): """If the feature with the given name or path exists, delete its value; otherwise, raise ``KeyError``.""" if self._frozen: raise ValueError(_FROZEN_ERROR) if isinstance(name_or_path, (string_types, Feature)): return dict.__delitem__(self, name_or_path) elif isinstance(name_or_path, tuple): if len(name_or_path) == 0: raise ValueError("The path () can not be set") else: parent = self[name_or_path[:-1]] if not isinstance(parent, FeatStruct): raise KeyError(name_or_path) # path contains base value del parent[name_or_path[-1]] else: raise TypeError(self._INDEX_ERROR % name_or_path) def __setitem__(self, name_or_path, value): """Set the value for the feature with the given name or path to ``value``. 
If ``name_or_path`` is an invalid path, raise ``KeyError``.""" if self._frozen: raise ValueError(_FROZEN_ERROR) if isinstance(name_or_path, (string_types, Feature)): return dict.__setitem__(self, name_or_path, value) elif isinstance(name_or_path, tuple): if len(name_or_path) == 0: raise ValueError("The path () can not be set") else: parent = self[name_or_path[:-1]] if not isinstance(parent, FeatStruct): raise KeyError(name_or_path) # path contains base value parent[name_or_path[-1]] = value else: raise TypeError(self._INDEX_ERROR % name_or_path) clear = _check_frozen(dict.clear) pop = _check_frozen(dict.pop) popitem = _check_frozen(dict.popitem) setdefault = _check_frozen(dict.setdefault) def update(self, features=None, **morefeatures): if self._frozen: raise ValueError(_FROZEN_ERROR) if features is None: items = () elif hasattr(features, 'items') and callable(features.items): items = features.items() elif hasattr(features, '__iter__'): items = features else: raise ValueError('Expected mapping or list of tuples') for key, val in items: if not isinstance(key, (string_types, Feature)): raise TypeError('Feature names must be strings') self[key] = val for key, val in morefeatures.items(): if not isinstance(key, (string_types, Feature)): raise TypeError('Feature names must be strings') self[key] = val ##//////////////////////////////////////////////////////////// #{ Copying ##//////////////////////////////////////////////////////////// def __deepcopy__(self, memo): memo[id(self)] = selfcopy = self.__class__() for (key, val) in self._items(): selfcopy[copy.deepcopy(key,memo)] = copy.deepcopy(val,memo) return selfcopy ##//////////////////////////////////////////////////////////// #{ Uniform Accessor Methods ##//////////////////////////////////////////////////////////// def _keys(self): return self.keys() def _values(self): return self.values() def _items(self): return self.items() ##//////////////////////////////////////////////////////////// #{ String Representations ##//////////////////////////////////////////////////////////// def __str__(self): """ Display a multi-line representation of this feature dictionary as an FVM (feature value matrix). """ return '\n'.join(self._str(self._find_reentrances({}), {})) def _repr(self, reentrances, reentrance_ids): segments = [] prefix = '' suffix = '' # If this is the first time we've seen a reentrant structure, # then assign it a unique identifier. if reentrances[id(self)]: assert id(self) not in reentrance_ids reentrance_ids[id(self)] = repr(len(reentrance_ids)+1) # sorting note: keys are unique strings, so we'll never fall # through to comparing values. 
for (fname, fval) in sorted(self.items()): display = getattr(fname, 'display', None) if id(fval) in reentrance_ids: segments.append('%s->(%s)' % (fname, reentrance_ids[id(fval)])) elif (display == 'prefix' and not prefix and isinstance(fval, (Variable, string_types))): prefix = '%s' % fval elif display == 'slash' and not suffix: if isinstance(fval, Variable): suffix = '/%s' % fval.name else: suffix = '/%s' % unicode_repr(fval) elif isinstance(fval, Variable): segments.append('%s=%s' % (fname, fval.name)) elif fval is True: segments.append('+%s' % fname) elif fval is False: segments.append('-%s' % fname) elif isinstance(fval, Expression): segments.append('%s=<%s>' % (fname, fval)) elif not isinstance(fval, FeatStruct): segments.append('%s=%s' % (fname, unicode_repr(fval))) else: fval_repr = fval._repr(reentrances, reentrance_ids) segments.append('%s=%s' % (fname, fval_repr)) # If it's reentrant, then add on an identifier tag. if reentrances[id(self)]: prefix = '(%s)%s' % (reentrance_ids[id(self)], prefix) return '%s[%s]%s' % (prefix, ', '.join(segments), suffix) def _str(self, reentrances, reentrance_ids): """ :return: A list of lines composing a string representation of this feature dictionary. :param reentrances: A dictionary that maps from the ``id`` of each feature value in self, indicating whether that value is reentrant or not. :param reentrance_ids: A dictionary mapping from each ``id`` of a feature value to a unique identifier. This is modified by ``repr``: the first time a reentrant feature value is displayed, an identifier is added to ``reentrance_ids`` for it. """ # If this is the first time we've seen a reentrant structure, # then tack on an id string. if reentrances[id(self)]: assert id(self) not in reentrance_ids reentrance_ids[id(self)] = repr(len(reentrance_ids)+1) # Special case: empty feature dict. if len(self) == 0: if reentrances[id(self)]: return ['(%s) []' % reentrance_ids[id(self)]] else: return ['[]'] # What's the longest feature name? Use this to align names. maxfnamelen = max(len("%s" % k) for k in self.keys()) lines = [] # sorting note: keys are unique strings, so we'll never fall # through to comparing values. for (fname, fval) in sorted(self.items()): fname = ("%s" % fname).ljust(maxfnamelen) if isinstance(fval, Variable): lines.append('%s = %s' % (fname,fval.name)) elif isinstance(fval, Expression): lines.append('%s = <%s>' % (fname, fval)) elif isinstance(fval, FeatList): fval_repr = fval._repr(reentrances, reentrance_ids) lines.append('%s = %s' % (fname, unicode_repr(fval_repr))) elif not isinstance(fval, FeatDict): # It's not a nested feature structure -- just print it. lines.append('%s = %s' % (fname, unicode_repr(fval))) elif id(fval) in reentrance_ids: # It's a feature structure we've seen before -- print # the reentrance id. lines.append('%s -> (%s)' % (fname, reentrance_ids[id(fval)])) else: # It's a new feature structure. Separate it from # other values by a blank line. if lines and lines[-1] != '': lines.append('') # Recursively print the feature's value (fval). fval_lines = fval._str(reentrances, reentrance_ids) # Indent each line to make room for fname. fval_lines = [(' '*(maxfnamelen+3))+l for l in fval_lines] # Pick which line we'll display fname on, & splice it in. nameline = (len(fval_lines)-1) // 2 fval_lines[nameline] = ( fname+' ='+fval_lines[nameline][maxfnamelen+2:]) # Add the feature structure to the output. lines += fval_lines # Separate FeatStructs by a blank line. lines.append('') # Get rid of any excess blank lines. 
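        # Illustrative note (added): for a simple structure the finished FVM
        # rendering looks like this (hypothetical session, assuming
        # ``from nltk.featstruct import FeatStruct``):
        #   >>> print(FeatStruct(number='sing'))
        #   [ number = 'sing' ]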
if lines[-1] == '': lines.pop() # Add brackets around everything. maxlen = max(len(line) for line in lines) lines = ['[ %s%s ]' % (line, ' '*(maxlen-len(line))) for line in lines] # If it's reentrant, then add on an identifier tag. if reentrances[id(self)]: idstr = '(%s) ' % reentrance_ids[id(self)] lines = [(' '*len(idstr))+l for l in lines] idline = (len(lines)-1) // 2 lines[idline] = idstr + lines[idline][len(idstr):] return lines ###################################################################### # Feature List ###################################################################### class FeatList(FeatStruct, list): """ A list of feature values, where each feature value is either a basic value (such as a string or an integer), or a nested feature structure. Feature lists may contain reentrant feature values. A "reentrant feature value" is a single feature value that can be accessed via multiple feature paths. Feature lists may also be cyclic. Two feature lists are considered equal if they assign the same values to all features, and have the same reentrances. :see: ``FeatStruct`` for information about feature paths, reentrance, cyclic feature structures, mutability, freezing, and hashing. """ def __init__(self, features=()): """ Create a new feature list, with the specified features. :param features: The initial list of features for this feature list. If ``features`` is a string, then it is paresd using ``FeatStructReader``. Otherwise, it should be a sequence of basic values and nested feature structures. """ if isinstance(features, string_types): FeatStructReader().fromstring(features, self) else: list.__init__(self, features) #//////////////////////////////////////////////////////////// #{ List methods #//////////////////////////////////////////////////////////// _INDEX_ERROR = "Expected int or feature path. Got %r." def __getitem__(self, name_or_path): if isinstance(name_or_path, integer_types): return list.__getitem__(self, name_or_path) elif isinstance(name_or_path, tuple): try: val = self for fid in name_or_path: if not isinstance(val, FeatStruct): raise KeyError # path contains base value val = val[fid] return val except (KeyError, IndexError): raise KeyError(name_or_path) else: raise TypeError(self._INDEX_ERROR % name_or_path) def __delitem__(self, name_or_path): """If the feature with the given name or path exists, delete its value; otherwise, raise ``KeyError``.""" if self._frozen: raise ValueError(_FROZEN_ERROR) if isinstance(name_or_path, (integer_types, slice)): return list.__delitem__(self, name_or_path) elif isinstance(name_or_path, tuple): if len(name_or_path) == 0: raise ValueError("The path () can not be set") else: parent = self[name_or_path[:-1]] if not isinstance(parent, FeatStruct): raise KeyError(name_or_path) # path contains base value del parent[name_or_path[-1]] else: raise TypeError(self._INDEX_ERROR % name_or_path) def __setitem__(self, name_or_path, value): """Set the value for the feature with the given name or path to ``value``. 
If ``name_or_path`` is an invalid path, raise ``KeyError``.""" if self._frozen: raise ValueError(_FROZEN_ERROR) if isinstance(name_or_path, (integer_types, slice)): return list.__setitem__(self, name_or_path, value) elif isinstance(name_or_path, tuple): if len(name_or_path) == 0: raise ValueError("The path () can not be set") else: parent = self[name_or_path[:-1]] if not isinstance(parent, FeatStruct): raise KeyError(name_or_path) # path contains base value parent[name_or_path[-1]] = value else: raise TypeError(self._INDEX_ERROR % name_or_path) # __delslice__ = _check_frozen(list.__delslice__, ' ') # __setslice__ = _check_frozen(list.__setslice__, ' ') __iadd__ = _check_frozen(list.__iadd__) __imul__ = _check_frozen(list.__imul__) append = _check_frozen(list.append) extend = _check_frozen(list.extend) insert = _check_frozen(list.insert) pop = _check_frozen(list.pop) remove = _check_frozen(list.remove) reverse = _check_frozen(list.reverse) sort = _check_frozen(list.sort) ##//////////////////////////////////////////////////////////// #{ Copying ##//////////////////////////////////////////////////////////// def __deepcopy__(self, memo): memo[id(self)] = selfcopy = self.__class__() selfcopy.extend(copy.deepcopy(fval,memo) for fval in self) return selfcopy ##//////////////////////////////////////////////////////////// #{ Uniform Accessor Methods ##//////////////////////////////////////////////////////////// def _keys(self): return list(range(len(self))) def _values(self): return self def _items(self): return enumerate(self) ##//////////////////////////////////////////////////////////// #{ String Representations ##//////////////////////////////////////////////////////////// # Special handling for: reentrances, variables, expressions. def _repr(self, reentrances, reentrance_ids): # If this is the first time we've seen a reentrant structure, # then assign it a unique identifier. if reentrances[id(self)]: assert id(self) not in reentrance_ids reentrance_ids[id(self)] = repr(len(reentrance_ids)+1) prefix = '(%s)' % reentrance_ids[id(self)] else: prefix = '' segments = [] for fval in self: if id(fval) in reentrance_ids: segments.append('->(%s)' % reentrance_ids[id(fval)]) elif isinstance(fval, Variable): segments.append(fval.name) elif isinstance(fval, Expression): segments.append('%s' % fval) elif isinstance(fval, FeatStruct): segments.append(fval._repr(reentrances, reentrance_ids)) else: segments.append('%s' % unicode_repr(fval)) return '%s[%s]' % (prefix, ', '.join(segments)) ###################################################################### # Variables & Bindings ###################################################################### def substitute_bindings(fstruct, bindings, fs_class='default'): """ Return the feature structure that is obtained by replacing each variable bound by ``bindings`` with its binding. If a variable is aliased to a bound variable, then it will be replaced by that variable's value. If a variable is aliased to an unbound variable, then it will be replaced by that variable. :type bindings: dict(Variable -> any) :param bindings: A dictionary mapping from variables to values. 
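For example (illustrative doctest):

>>> from nltk.featstruct import FeatStruct
>>> from nltk.sem.logic import Variable
>>> FeatStruct('[number=?n]').substitute_bindings({Variable('?n'): 'sing'})
[number='sing']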
""" if fs_class == 'default': fs_class = _default_fs_class(fstruct) fstruct = copy.deepcopy(fstruct) _substitute_bindings(fstruct, bindings, fs_class, set()) return fstruct def _substitute_bindings(fstruct, bindings, fs_class, visited): # Visit each node only once: if id(fstruct) in visited: return visited.add(id(fstruct)) if _is_mapping(fstruct): items = fstruct.items() elif _is_sequence(fstruct): items = enumerate(fstruct) else: raise ValueError('Expected mapping or sequence') for (fname, fval) in items: while (isinstance(fval, Variable) and fval in bindings): fval = fstruct[fname] = bindings[fval] if isinstance(fval, fs_class): _substitute_bindings(fval, bindings, fs_class, visited) elif isinstance(fval, SubstituteBindingsI): fstruct[fname] = fval.substitute_bindings(bindings) def retract_bindings(fstruct, bindings, fs_class='default'): """ Return the feature structure that is obtained by replacing each feature structure value that is bound by ``bindings`` with the variable that binds it. A feature structure value must be identical to a bound value (i.e., have equal id) to be replaced. ``bindings`` is modified to point to this new feature structure, rather than the original feature structure. Feature structure values in ``bindings`` may be modified if they are contained in ``fstruct``. """ if fs_class == 'default': fs_class = _default_fs_class(fstruct) (fstruct, new_bindings) = copy.deepcopy((fstruct, bindings)) bindings.update(new_bindings) inv_bindings = dict((id(val),var) for (var,val) in bindings.items()) _retract_bindings(fstruct, inv_bindings, fs_class, set()) return fstruct def _retract_bindings(fstruct, inv_bindings, fs_class, visited): # Visit each node only once: if id(fstruct) in visited: return visited.add(id(fstruct)) if _is_mapping(fstruct): items = fstruct.items() elif _is_sequence(fstruct): items = enumerate(fstruct) else: raise ValueError('Expected mapping or sequence') for (fname, fval) in items: if isinstance(fval, fs_class): if id(fval) in inv_bindings: fstruct[fname] = inv_bindings[id(fval)] _retract_bindings(fval, inv_bindings, fs_class, visited) def find_variables(fstruct, fs_class='default'): """ :return: The set of variables used by this feature structure. :rtype: set(Variable) """ if fs_class == 'default': fs_class = _default_fs_class(fstruct) return _variables(fstruct, set(), fs_class, set()) def _variables(fstruct, vars, fs_class, visited): # Visit each node only once: if id(fstruct) in visited: return visited.add(id(fstruct)) if _is_mapping(fstruct): items = fstruct.items() elif _is_sequence(fstruct): items = enumerate(fstruct) else: raise ValueError('Expected mapping or sequence') for (fname, fval) in items: if isinstance(fval, Variable): vars.add(fval) elif isinstance(fval, fs_class): _variables(fval, vars, fs_class, visited) elif isinstance(fval, SubstituteBindingsI): vars.update(fval.variables()) return vars def rename_variables(fstruct, vars=None, used_vars=(), new_vars=None, fs_class='default'): """ Return the feature structure that is obtained by replacing any of this feature structure's variables that are in ``vars`` with new variables. The names for these new variables will be names that are not used by any variable in ``vars``, or in ``used_vars``, or in this feature structure. :type vars: set :param vars: The set of variables that should be renamed. If not specified, ``find_variables(fstruct)`` is used; i.e., all variables will be given new names. 
:type used_vars: set :param used_vars: A set of variables whose names should not be used by the new variables. :type new_vars: dict(Variable -> Variable) :param new_vars: A dictionary that is used to hold the mapping from old variables to new variables. For each variable *v* in this feature structure: - If ``new_vars`` maps *v* to *v'*, then *v* will be replaced by *v'*. - If ``new_vars`` does not contain *v*, but ``vars`` does contain *v*, then a new entry will be added to ``new_vars``, mapping *v* to the new variable that is used to replace it. To consistently rename the variables in a set of feature structures, simply apply rename_variables to each one, using the same dictionary: >>> from nltk.featstruct import FeatStruct >>> fstruct1 = FeatStruct('[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]') >>> fstruct2 = FeatStruct('[subj=[agr=[number=?z,gender=?y]], obj=[agr=[number=?z,gender=?y]]]') >>> new_vars = {} # Maps old vars to alpha-renamed vars >>> fstruct1.rename_variables(new_vars=new_vars) [obj=[agr=[gender=?y2]], subj=[agr=[gender=?y2]]] >>> fstruct2.rename_variables(new_vars=new_vars) [obj=[agr=[gender=?y2, number=?z2]], subj=[agr=[gender=?y2, number=?z2]]] If new_vars is not specified, then an empty dictionary is used. """ if fs_class == 'default': fs_class = _default_fs_class(fstruct) # Default values: if new_vars is None: new_vars = {} if vars is None: vars = find_variables(fstruct, fs_class) else: vars = set(vars) # Add our own variables to used_vars. used_vars = find_variables(fstruct, fs_class).union(used_vars) # Copy ourselves, and rename variables in the copy. return _rename_variables(copy.deepcopy(fstruct), vars, used_vars, new_vars, fs_class, set()) def _rename_variables(fstruct, vars, used_vars, new_vars, fs_class, visited): if id(fstruct) in visited: return visited.add(id(fstruct)) if _is_mapping(fstruct): items = fstruct.items() elif _is_sequence(fstruct): items = enumerate(fstruct) else: raise ValueError('Expected mapping or sequence') for (fname, fval) in items: if isinstance(fval, Variable): # If it's in new_vars, then rebind it. if fval in new_vars: fstruct[fname] = new_vars[fval] # If it's in vars, pick a new name for it. elif fval in vars: new_vars[fval] = _rename_variable(fval, used_vars) fstruct[fname] = new_vars[fval] used_vars.add(new_vars[fval]) elif isinstance(fval, fs_class): _rename_variables(fval, vars, used_vars, new_vars, fs_class, visited) elif isinstance(fval, SubstituteBindingsI): # Pick new names for any variables in `vars` for var in fval.variables(): if var in vars and var not in new_vars: new_vars[var] = _rename_variable(var, used_vars) used_vars.add(new_vars[var]) # Replace all variables in `new_vars`. fstruct[fname] = fval.substitute_bindings(new_vars) return fstruct def _rename_variable(var, used_vars): name, n = re.sub('\d+$', '', var.name), 2 if not name: name = '?' while Variable('%s%s' % (name, n)) in used_vars: n += 1 return Variable('%s%s' % (name, n)) def remove_variables(fstruct, fs_class='default'): """ :rtype: FeatStruct :return: The feature structure that is obtained by deleting all features whose values are ``Variables``. 
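For example (illustrative doctest):

>>> from nltk.featstruct import FeatStruct, remove_variables
>>> from nltk.sem.logic import Variable
>>> remove_variables(FeatStruct(subj=Variable('?x'), tense='past'))
[tense='past']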
""" if fs_class == 'default': fs_class = _default_fs_class(fstruct) return _remove_variables(copy.deepcopy(fstruct), fs_class, set()) def _remove_variables(fstruct, fs_class, visited): if id(fstruct) in visited: return visited.add(id(fstruct)) if _is_mapping(fstruct): items = list(fstruct.items()) elif _is_sequence(fstruct): items = list(enumerate(fstruct)) else: raise ValueError('Expected mapping or sequence') for (fname, fval) in items: if isinstance(fval, Variable): del fstruct[fname] elif isinstance(fval, fs_class): _remove_variables(fval, fs_class, visited) return fstruct ###################################################################### # Unification ###################################################################### @python_2_unicode_compatible class _UnificationFailure(object): def __repr__(self): return 'nltk.featstruct.UnificationFailure' UnificationFailure = _UnificationFailure() """A unique value used to indicate unification failure. It can be returned by ``Feature.unify_base_values()`` or by custom ``fail()`` functions to indicate that unificaiton should fail.""" # The basic unification algorithm: # 1. Make copies of self and other (preserving reentrance) # 2. Destructively unify self and other # 3. Apply forward pointers, to preserve reentrance. # 4. Replace bound variables with their values. def unify(fstruct1, fstruct2, bindings=None, trace=False, fail=None, rename_vars=True, fs_class='default'): """ Unify ``fstruct1`` with ``fstruct2``, and return the resulting feature structure. This unified feature structure is the minimal feature structure that contains all feature value assignments from both ``fstruct1`` and ``fstruct2``, and that preserves all reentrancies. If no such feature structure exists (because ``fstruct1`` and ``fstruct2`` specify incompatible values for some feature), then unification fails, and ``unify`` returns None. Bound variables are replaced by their values. Aliased variables are replaced by their representative variable (if unbound) or the value of their representative variable (if bound). I.e., if variable *v* is in ``bindings``, then *v* is replaced by ``bindings[v]``. This will be repeated until the variable is replaced by an unbound variable or a non-variable value. Unbound variables are bound when they are unified with values; and aliased when they are unified with variables. I.e., if variable *v* is not in ``bindings``, and is unified with a variable or value *x*, then ``bindings[v]`` is set to *x*. If ``bindings`` is unspecified, then all variables are assumed to be unbound. I.e., ``bindings`` defaults to an empty dict. >>> from nltk.featstruct import FeatStruct >>> FeatStruct('[a=?x]').unify(FeatStruct('[b=?x]')) [a=?x, b=?x2] :type bindings: dict(Variable -> any) :param bindings: A set of variable bindings to be used and updated during unification. :type trace: bool :param trace: If true, generate trace output. :type rename_vars: bool :param rename_vars: If True, then rename any variables in ``fstruct2`` that are also used in ``fstruct1``, in order to avoid collisions on variable names. """ # Decide which class(es) will be treated as feature structures, # for the purposes of unification. if fs_class == 'default': fs_class = _default_fs_class(fstruct1) if _default_fs_class(fstruct2) != fs_class: raise ValueError("Mixing FeatStruct objects with Python " "dicts and lists is not supported.") assert isinstance(fstruct1, fs_class) assert isinstance(fstruct2, fs_class) # If bindings are unspecified, use an empty set of bindings. 
user_bindings = (bindings is not None) if bindings is None: bindings = {} # Make copies of fstruct1 and fstruct2 (since the unification # algorithm is destructive). Do it all at once, to preserve # reentrance links between fstruct1 and fstruct2. Copy bindings # as well, in case there are any bound vars that contain parts # of fstruct1 or fstruct2. (fstruct1copy, fstruct2copy, bindings_copy) = ( copy.deepcopy((fstruct1, fstruct2, bindings))) # Copy the bindings back to the original bindings dict. bindings.update(bindings_copy) if rename_vars: vars1 = find_variables(fstruct1copy, fs_class) vars2 = find_variables(fstruct2copy, fs_class) _rename_variables(fstruct2copy, vars1, vars2, {}, fs_class, set()) # Do the actual unification. If it fails, return None. forward = {} if trace: _trace_unify_start((), fstruct1copy, fstruct2copy) try: result = _destructively_unify(fstruct1copy, fstruct2copy, bindings, forward, trace, fail, fs_class, ()) except _UnificationFailureError: return None # _destructively_unify might return UnificationFailure, e.g. if we # tried to unify a mapping with a sequence. if result is UnificationFailure: if fail is None: return None else: return fail(fstruct1copy, fstruct2copy, ()) # Replace any feature structure that has a forward pointer # with the target of its forward pointer. result = _apply_forwards(result, forward, fs_class, set()) if user_bindings: _apply_forwards_to_bindings(forward, bindings) # Replace bound vars with values. _resolve_aliases(bindings) _substitute_bindings(result, bindings, fs_class, set()) # Return the result. if trace: _trace_unify_succeed((), result) if trace: _trace_bindings((), bindings) return result class _UnificationFailureError(Exception): """An exception that is used by ``_destructively_unify`` to abort unification when a failure is encountered.""" def _destructively_unify(fstruct1, fstruct2, bindings, forward, trace, fail, fs_class, path): """ Attempt to unify ``fstruct1`` and ``fstruct2`` by modifying them in-place. If the unification succeeds, then ``fstruct1`` will contain the unified value, the value of ``fstruct2`` is undefined, and forward[id(fstruct2)] is set to fstruct1. If the unification fails, then a _UnificationFailureError is raised, and the values of ``fstruct1`` and ``fstruct2`` are undefined. :param bindings: A dictionary mapping variables to values. :param forward: A dictionary mapping feature structures ids to replacement structures. When two feature structures are merged, a mapping from one to the other will be added to the forward dictionary; and changes will be made only to the target of the forward dictionary. ``_destructively_unify`` will always 'follow' any links in the forward dictionary for fstruct1 and fstruct2 before actually unifying them. :param trace: If true, generate trace output :param path: The feature path that led us to this unification step. Used for trace output. """ # If fstruct1 is already identical to fstruct2, we're done. # Note: this, together with the forward pointers, ensures # that unification will terminate even for cyclic structures. if fstruct1 is fstruct2: if trace: _trace_unify_identity(path, fstruct1) return fstruct1 # Set fstruct2's forward pointer to point to fstruct1; this makes # fstruct1 the canonical copy for fstruct2. Note that we need to # do this before we recurse into any child structures, in case # they're cyclic. 
forward[id(fstruct2)] = fstruct1 # Unifying two mappings: if _is_mapping(fstruct1) and _is_mapping(fstruct2): for fname in fstruct1: if getattr(fname, 'default', None) is not None: fstruct2.setdefault(fname, fname.default) for fname in fstruct2: if getattr(fname, 'default', None) is not None: fstruct1.setdefault(fname, fname.default) # Unify any values that are defined in both fstruct1 and # fstruct2. Copy any values that are defined in fstruct2 but # not in fstruct1 to fstruct1. Note: sorting fstruct2's # features isn't actually necessary; but we do it to give # deterministic behavior, e.g. for tracing. for fname, fval2 in sorted(fstruct2.items()): if fname in fstruct1: fstruct1[fname] = _unify_feature_values( fname, fstruct1[fname], fval2, bindings, forward, trace, fail, fs_class, path+(fname,)) else: fstruct1[fname] = fval2 return fstruct1 # Contains the unified value. # Unifying two sequences: elif _is_sequence(fstruct1) and _is_sequence(fstruct2): # If the lengths don't match, fail. if len(fstruct1) != len(fstruct2): return UnificationFailure # Unify corresponding values in fstruct1 and fstruct2. for findex in range(len(fstruct1)): fstruct1[findex] = _unify_feature_values( findex, fstruct1[findex], fstruct2[findex], bindings, forward, trace, fail, fs_class, path+(findex,)) return fstruct1 # Contains the unified value. # Unifying sequence & mapping: fail. The failure function # doesn't get a chance to recover in this case. elif ((_is_sequence(fstruct1) or _is_mapping(fstruct1)) and (_is_sequence(fstruct2) or _is_mapping(fstruct2))): return UnificationFailure # Unifying anything else: not allowed! raise TypeError('Expected mappings or sequences') def _unify_feature_values(fname, fval1, fval2, bindings, forward, trace, fail, fs_class, fpath): """ Attempt to unify ``fval1`` and and ``fval2``, and return the resulting unified value. The method of unification will depend on the types of ``fval1`` and ``fval2``: 1. If they're both feature structures, then destructively unify them (see ``_destructively_unify()``. 2. If they're both unbound variables, then alias one variable to the other (by setting bindings[v2]=v1). 3. If one is an unbound variable, and the other is a value, then bind the unbound variable to the value. 4. If one is a feature structure, and the other is a base value, then fail. 5. If they're both base values, then unify them. By default, this will succeed if they are equal, and fail otherwise. """ if trace: _trace_unify_start(fpath, fval1, fval2) # Look up the "canonical" copy of fval1 and fval2 while id(fval1) in forward: fval1 = forward[id(fval1)] while id(fval2) in forward: fval2 = forward[id(fval2)] # If fval1 or fval2 is a bound variable, then # replace it by the variable's bound value. This # includes aliased variables, which are encoded as # variables bound to other variables. 
fvar1 = fvar2 = None while isinstance(fval1, Variable) and fval1 in bindings: fvar1 = fval1 fval1 = bindings[fval1] while isinstance(fval2, Variable) and fval2 in bindings: fvar2 = fval2 fval2 = bindings[fval2] # Case 1: Two feature structures (recursive case) if isinstance(fval1, fs_class) and isinstance(fval2, fs_class): result = _destructively_unify(fval1, fval2, bindings, forward, trace, fail, fs_class, fpath) # Case 2: Two unbound variables (create alias) elif (isinstance(fval1, Variable) and isinstance(fval2, Variable)): if fval1 != fval2: bindings[fval2] = fval1 result = fval1 # Case 3: An unbound variable and a value (bind) elif isinstance(fval1, Variable): bindings[fval1] = fval2 result = fval1 elif isinstance(fval2, Variable): bindings[fval2] = fval1 result = fval2 # Case 4: A feature structure & a base value (fail) elif isinstance(fval1, fs_class) or isinstance(fval2, fs_class): result = UnificationFailure # Case 5: Two base values else: # Case 5a: Feature defines a custom unification method for base values if isinstance(fname, Feature): result = fname.unify_base_values(fval1, fval2, bindings) # Case 5b: Feature value defines custom unification method elif isinstance(fval1, CustomFeatureValue): result = fval1.unify(fval2) # Sanity check: unify value should be symmetric if (isinstance(fval2, CustomFeatureValue) and result != fval2.unify(fval1)): raise AssertionError( 'CustomFeatureValue objects %r and %r disagree ' 'about unification value: %r vs. %r' % (fval1, fval2, result, fval2.unify(fval1))) elif isinstance(fval2, CustomFeatureValue): result = fval2.unify(fval1) # Case 5c: Simple values -- check if they're equal. else: if fval1 == fval2: result = fval1 else: result = UnificationFailure # If either value was a bound variable, then update the # bindings. (This is really only necessary if fname is a # Feature or if either value is a CustomFeatureValue.) if result is not UnificationFailure: if fvar1 is not None: bindings[fvar1] = result result = fvar1 if fvar2 is not None and fvar2 != fvar1: bindings[fvar2] = result result = fvar2 # If we unification failed, call the failure function; it # might decide to continue anyway. if result is UnificationFailure: if fail is not None: result = fail(fval1, fval2, fpath) if trace: _trace_unify_fail(fpath[:-1], result) if result is UnificationFailure: raise _UnificationFailureError # Normalize the result. if isinstance(result, fs_class): result = _apply_forwards(result, forward, fs_class, set()) if trace: _trace_unify_succeed(fpath, result) if trace and isinstance(result, fs_class): _trace_bindings(fpath, bindings) return result def _apply_forwards_to_bindings(forward, bindings): """ Replace any feature structure that has a forward pointer with the target of its forward pointer (to preserve reentrancy). """ for (var, value) in bindings.items(): while id(value) in forward: value = forward[id(value)] bindings[var] = value def _apply_forwards(fstruct, forward, fs_class, visited): """ Replace any feature structure that has a forward pointer with the target of its forward pointer (to preserve reentrancy). """ # Follow our own forwards pointers (if any) while id(fstruct) in forward: fstruct = forward[id(fstruct)] # Visit each node only once: if id(fstruct) in visited: return visited.add(id(fstruct)) if _is_mapping(fstruct): items = fstruct.items() elif _is_sequence(fstruct): items = enumerate(fstruct) else: raise ValueError('Expected mapping or sequence') for fname, fval in items: if isinstance(fval, fs_class): # Replace w/ forwarded value. 
while id(fval) in forward: fval = forward[id(fval)] fstruct[fname] = fval # Recurse to child. _apply_forwards(fval, forward, fs_class, visited) return fstruct def _resolve_aliases(bindings): """ Replace any bound aliased vars with their binding; and replace any unbound aliased vars with their representative var. """ for (var, value) in bindings.items(): while isinstance(value, Variable) and value in bindings: value = bindings[var] = bindings[value] def _trace_unify_start(path, fval1, fval2): if path == (): print('\nUnification trace:') else: fullname = '.'.join("%s" % n for n in path) print(' '+'| '*(len(path)-1)+'|') print(' '+'| '*(len(path)-1)+'| Unify feature: %s' % fullname) print(' '+'| '*len(path)+' / '+_trace_valrepr(fval1)) print(' '+'| '*len(path)+'|\\ '+_trace_valrepr(fval2)) def _trace_unify_identity(path, fval1): print(' '+'| '*len(path)+'|') print(' '+'| '*len(path)+'| (identical objects)') print(' '+'| '*len(path)+'|') print(' '+'| '*len(path)+'+-->'+unicode_repr(fval1)) def _trace_unify_fail(path, result): if result is UnificationFailure: resume = '' else: resume = ' (nonfatal)' print(' '+'| '*len(path)+'| |') print(' '+'X '*len(path)+'X X <-- FAIL'+resume) def _trace_unify_succeed(path, fval1): # Print the result. print(' '+'| '*len(path)+'|') print(' '+'| '*len(path)+'+-->'+unicode_repr(fval1)) def _trace_bindings(path, bindings): # Print the bindings (if any). if len(bindings) > 0: binditems = sorted(bindings.items(), key=lambda v:v[0].name) bindstr = '{%s}' % ', '.join( '%s: %s' % (var, _trace_valrepr(val)) for (var, val) in binditems) print(' '+'| '*len(path)+' Bindings: '+bindstr) def _trace_valrepr(val): if isinstance(val, Variable): return '%s' % val else: return '%s' % unicode_repr(val) def subsumes(fstruct1, fstruct2): """ Return True if ``fstruct1`` subsumes ``fstruct2``. I.e., return true if unifying ``fstruct1`` with ``fstruct2`` would result in a feature structure equal to ``fstruct2.`` :rtype: bool """ return fstruct2 == unify(fstruct1, fstruct2) def conflicts(fstruct1, fstruct2, trace=0): """ Return a list of the feature paths of all features which are assigned incompatible values by ``fstruct1`` and ``fstruct2``. :rtype: list(tuple) """ conflict_list = [] def add_conflict(fval1, fval2, path): conflict_list.append(path) return fval1 unify(fstruct1, fstruct2, fail=add_conflict, trace=trace) return conflict_list ###################################################################### # Helper Functions ###################################################################### def _is_mapping(v): return hasattr(v, '__contains__') and hasattr(v, 'keys') def _is_sequence(v): return (hasattr(v, '__iter__') and hasattr(v, '__len__') and not isinstance(v, string_types)) def _default_fs_class(obj): if isinstance(obj, FeatStruct): return FeatStruct if isinstance(obj, (dict, list)): return (dict, list) else: raise ValueError('To unify objects of type %s, you must specify ' 'fs_class explicitly.' % obj.__class__.__name__) ###################################################################### # FeatureValueSet & FeatureValueTuple ###################################################################### class SubstituteBindingsSequence(SubstituteBindingsI): """ A mixin class for sequence clases that distributes variables() and substitute_bindings() over the object's elements. 
""" def variables(self): return ([elt for elt in self if isinstance(elt, Variable)] + sum([list(elt.variables()) for elt in self if isinstance(elt, SubstituteBindingsI)], [])) def substitute_bindings(self, bindings): return self.__class__([self.subst(v, bindings) for v in self]) def subst(self, v, bindings): if isinstance(v, SubstituteBindingsI): return v.substitute_bindings(bindings) else: return bindings.get(v, v) @python_2_unicode_compatible class FeatureValueTuple(SubstituteBindingsSequence, tuple): """ A base feature value that is a tuple of other base feature values. FeatureValueTuple implements ``SubstituteBindingsI``, so it any variable substitutions will be propagated to the elements contained by the set. A ``FeatureValueTuple`` is immutable. """ def __repr__(self): # [xx] really use %s here? if len(self) == 0: return '()' return '(%s)' % ', '.join('%s' % (b,) for b in self) @python_2_unicode_compatible class FeatureValueSet(SubstituteBindingsSequence, frozenset): """ A base feature value that is a set of other base feature values. FeatureValueSet implements ``SubstituteBindingsI``, so it any variable substitutions will be propagated to the elements contained by the set. A ``FeatureValueSet`` is immutable. """ def __repr__(self): # [xx] really use %s here? if len(self) == 0: return '{/}' # distinguish from dict. # n.b., we sort the string reprs of our elements, to ensure # that our own repr is deterministic. return '{%s}' % ', '.join(sorted('%s' % (b,) for b in self)) __str__ = __repr__ @python_2_unicode_compatible class FeatureValueUnion(SubstituteBindingsSequence, frozenset): """ A base feature value that represents the union of two or more ``FeatureValueSet`` or ``Variable``. """ def __new__(cls, values): # If values contains FeatureValueUnions, then collapse them. values = _flatten(values, FeatureValueUnion) # If the resulting list contains no variables, then # use a simple FeatureValueSet instead. if sum(isinstance(v, Variable) for v in values) == 0: values = _flatten(values, FeatureValueSet) return FeatureValueSet(values) # If we contain a single variable, return that variable. if len(values) == 1: return list(values)[0] # Otherwise, build the FeatureValueUnion. return frozenset.__new__(cls, values) def __repr__(self): # n.b., we sort the string reprs of our elements, to ensure # that our own repr is deterministic. also, note that len(self) # is guaranteed to be 2 or more. return '{%s}' % '+'.join(sorted('%s' % (b,) for b in self)) @python_2_unicode_compatible class FeatureValueConcat(SubstituteBindingsSequence, tuple): """ A base feature value that represents the concatenation of two or more ``FeatureValueTuple`` or ``Variable``. """ def __new__(cls, values): # If values contains FeatureValueConcats, then collapse them. values = _flatten(values, FeatureValueConcat) # If the resulting list contains no variables, then # use a simple FeatureValueTuple instead. if sum(isinstance(v, Variable) for v in values) == 0: values = _flatten(values, FeatureValueTuple) return FeatureValueTuple(values) # If we contain a single variable, return that variable. if len(values) == 1: return list(values)[0] # Otherwise, build the FeatureValueConcat. return tuple.__new__(cls, values) def __repr__(self): # n.b.: len(self) is guaranteed to be 2 or more. return '(%s)' % '+'.join('%s' % (b,) for b in self) def _flatten(lst, cls): """ Helper function -- return a copy of list, with all elements of type ``cls`` spliced in rather than appended in. 
""" result = [] for elt in lst: if isinstance(elt, cls): result.extend(elt) else: result.append(elt) return result ###################################################################### # Specialized Features ###################################################################### @total_ordering @python_2_unicode_compatible class Feature(object): """ A feature identifier that's specialized to put additional constraints, default values, etc. """ def __init__(self, name, default=None, display=None): assert display in (None, 'prefix', 'slash') self._name = name # [xx] rename to .identifier? self._default = default # [xx] not implemented yet. self._display = display if self._display == 'prefix': self._sortkey = (-1, self._name) elif self._display == 'slash': self._sortkey = (1, self._name) else: self._sortkey = (0, self._name) @property def name(self): """The name of this feature.""" return self._name @property def default(self): """Default value for this feature.""" return self._default @property def display(self): """Custom display location: can be prefix, or slash.""" return self._display def __repr__(self): return '*%s*' % self.name def __lt__(self, other): if isinstance(other, string_types): return True if not isinstance(other, Feature): raise_unorderable_types("<", self, other) return self._sortkey < other._sortkey def __eq__(self, other): return type(self) == type(other) and self._name == other._name def __ne__(self, other): return not self == other def __hash__(self): return hash(self._name) #//////////////////////////////////////////////////////////// # These can be overridden by subclasses: #//////////////////////////////////////////////////////////// def read_value(self, s, position, reentrances, parser): return parser.read_value(s, position, reentrances) def unify_base_values(self, fval1, fval2, bindings): """ If possible, return a single value.. If not, return the value ``UnificationFailure``. """ if fval1 == fval2: return fval1 else: return UnificationFailure class SlashFeature(Feature): def read_value(self, s, position, reentrances, parser): return parser.read_partial(s, position, reentrances) class RangeFeature(Feature): RANGE_RE = re.compile('(-?\d+):(-?\d+)') def read_value(self, s, position, reentrances, parser): m = self.RANGE_RE.match(s, position) if not m: raise ValueError('range', position) return (int(m.group(1)), int(m.group(2))), m.end() def unify_base_values(self, fval1, fval2, bindings): if fval1 is None: return fval2 if fval2 is None: return fval1 rng = max(fval1[0], fval2[0]), min(fval1[1], fval2[1]) if rng[1] < rng[0]: return UnificationFailure return rng SLASH = SlashFeature('slash', default=False, display='slash') TYPE = Feature('type', display='prefix') ###################################################################### # Specialized Feature Values ###################################################################### @total_ordering class CustomFeatureValue(object): """ An abstract base class for base values that define a custom unification method. The custom unification method of ``CustomFeatureValue`` will be used during unification if: - The ``CustomFeatureValue`` is unified with another base value. - The ``CustomFeatureValue`` is not the value of a customized ``Feature`` (which defines its own unification method). If two ``CustomFeatureValue`` objects are unified with one another during feature structure unification, then the unified base values they return *must* be equal; otherwise, an ``AssertionError`` will be raised. 
Subclasses must define ``unify()``, ``__eq__()`` and ``__lt__()``. Subclasses may also wish to define ``__hash__()``. """ def unify(self, other): """ If this base value unifies with ``other``, then return the unified value. Otherwise, return ``UnificationFailure``. """ raise NotImplementedError('abstract base class') def __eq__(self, other): raise NotImplementedError('abstract base class') def __ne__(self, other): return not self == other def __lt__(self, other): raise NotImplementedError('abstract base class') def __hash__(self): raise TypeError('%s objects or unhashable' % self.__class__.__name__) ###################################################################### # Feature Structure Reader ###################################################################### class FeatStructReader(object): def __init__(self, features=(SLASH, TYPE), fdict_class=FeatStruct, flist_class=FeatList, logic_parser=None): self._features = dict((f.name,f) for f in features) self._fdict_class = fdict_class self._flist_class = flist_class self._prefix_feature = None self._slash_feature = None for feature in features: if feature.display == 'slash': if self._slash_feature: raise ValueError('Multiple features w/ display=slash') self._slash_feature = feature if feature.display == 'prefix': if self._prefix_feature: raise ValueError('Multiple features w/ display=prefix') self._prefix_feature = feature self._features_with_defaults = [feature for feature in features if feature.default is not None] if logic_parser is None: logic_parser = LogicParser() self._logic_parser = logic_parser def fromstring(self, s, fstruct=None): """ Convert a string representation of a feature structure (as displayed by repr) into a ``FeatStruct``. This process imposes the following restrictions on the string representation: - Feature names cannot contain any of the following: whitespace, parentheses, quote marks, equals signs, dashes, commas, and square brackets. Feature names may not begin with plus signs or minus signs. - Only the following basic feature value are supported: strings, integers, variables, None, and unquoted alphanumeric strings. - For reentrant values, the first mention must specify a reentrance identifier and a value; and any subsequent mentions must use arrows (``'->'``) to reference the reentrance identifier. """ s = s.strip() value, position = self.read_partial(s, 0, {}, fstruct) if position != len(s): self._error(s, 'end of string', position) return value _START_FSTRUCT_RE = re.compile(r'\s*(?:\((\d+)\)\s*)?(\??[\w-]+)?(\[)') _END_FSTRUCT_RE = re.compile(r'\s*]\s*') _SLASH_RE = re.compile(r'/') _FEATURE_NAME_RE = re.compile(r'\s*([+-]?)([^\s\(\)<>"\'\-=\[\],]+)\s*') _REENTRANCE_RE = re.compile(r'\s*->\s*') _TARGET_RE = re.compile(r'\s*\((\d+)\)\s*') _ASSIGN_RE = re.compile(r'\s*=\s*') _COMMA_RE = re.compile(r'\s*,\s*') _BARE_PREFIX_RE = re.compile(r'\s*(?:\((\d+)\)\s*)?(\??[\w-]+\s*)()') # This one is used to distinguish fdicts from flists: _START_FDICT_RE = re.compile(r'(%s)|(%s\s*(%s\s*(=|->)|[+-]%s|\]))' % ( _BARE_PREFIX_RE.pattern, _START_FSTRUCT_RE.pattern, _FEATURE_NAME_RE.pattern, _FEATURE_NAME_RE.pattern)) def read_partial(self, s, position=0, reentrances=None, fstruct=None): """ Helper function that reads in a feature structure. :param s: The string to read. :param position: The position in the string to start parsing. :param reentrances: A dictionary from reentrance ids to values. Defaults to an empty dictionary. 
:return: A tuple (val, pos) of the feature structure created by parsing and the position where the parsed feature structure ends. :rtype: bool """ if reentrances is None: reentrances = {} try: return self._read_partial(s, position, reentrances, fstruct) except ValueError as e: if len(e.args) != 2: raise self._error(s, *e.args) def _read_partial(self, s, position, reentrances, fstruct=None): # Create the new feature structure if fstruct is None: if self._START_FDICT_RE.match(s, position): fstruct = self._fdict_class() else: fstruct = self._flist_class() # Read up to the open bracket. match = self._START_FSTRUCT_RE.match(s, position) if not match: match = self._BARE_PREFIX_RE.match(s, position) if not match: raise ValueError('open bracket or identifier', position) position = match.end() # If there as an identifier, record it. if match.group(1): identifier = match.group(1) if identifier in reentrances: raise ValueError('new identifier', match.start(1)) reentrances[identifier] = fstruct if isinstance(fstruct, FeatDict): fstruct.clear() return self._read_partial_featdict(s, position, match, reentrances, fstruct) else: del fstruct[:] return self._read_partial_featlist(s, position, match, reentrances, fstruct) def _read_partial_featlist(self, s, position, match, reentrances, fstruct): # Prefix features are not allowed: if match.group(2): raise ValueError('open bracket') # Bare prefixes are not allowed: if not match.group(3): raise ValueError('open bracket') # Build a list of the features defined by the structure. while position < len(s): # Check for the close bracket. match = self._END_FSTRUCT_RE.match(s, position) if match is not None: return fstruct, match.end() # Reentances have the form "-> (target)" match = self._REENTRANCE_RE.match(s, position) if match: position = match.end() match = self._TARGET_RE.match(s, position) if not match: raise ValueError('identifier', position) target = match.group(1) if target not in reentrances: raise ValueError('bound identifier', position) position = match.end() fstruct.append(reentrances[target]) # Anything else is a value. else: value, position = ( self._read_value(0, s, position, reentrances)) fstruct.append(value) # If there's a close bracket, handle it at the top of the loop. if self._END_FSTRUCT_RE.match(s, position): continue # Otherwise, there should be a comma match = self._COMMA_RE.match(s, position) if match is None: raise ValueError('comma', position) position = match.end() # We never saw a close bracket. raise ValueError('close bracket', position) def _read_partial_featdict(self, s, position, match, reentrances, fstruct): # If there was a prefix feature, record it. if match.group(2): if self._prefix_feature is None: raise ValueError('open bracket or identifier', match.start(2)) prefixval = match.group(2).strip() if prefixval.startswith('?'): prefixval = Variable(prefixval) fstruct[self._prefix_feature] = prefixval # If group 3 is empty, then we just have a bare prefix, so # we're done. if not match.group(3): return self._finalize(s, match.end(), reentrances, fstruct) # Build a list of the features defined by the structure. # Each feature has one of the three following forms: # name = value # name -> (target) # +name # -name while position < len(s): # Use these variables to hold info about each feature: name = value = None # Check for the close bracket. 
match = self._END_FSTRUCT_RE.match(s, position) if match is not None: return self._finalize(s, match.end(), reentrances, fstruct) # Get the feature name's name match = self._FEATURE_NAME_RE.match(s, position) if match is None: raise ValueError('feature name', position) name = match.group(2) position = match.end() # Check if it's a special feature. if name[0] == '*' and name[-1] == '*': name = self._features.get(name[1:-1]) if name is None: raise ValueError('known special feature', match.start(2)) # Check if this feature has a value already. if name in fstruct: raise ValueError('new name', match.start(2)) # Boolean value ("+name" or "-name") if match.group(1) == '+': value = True if match.group(1) == '-': value = False # Reentrance link ("-> (target)") if value is None: match = self._REENTRANCE_RE.match(s, position) if match is not None: position = match.end() match = self._TARGET_RE.match(s, position) if not match: raise ValueError('identifier', position) target = match.group(1) if target not in reentrances: raise ValueError('bound identifier', position) position = match.end() value = reentrances[target] # Assignment ("= value"). if value is None: match = self._ASSIGN_RE.match(s, position) if match: position = match.end() value, position = ( self._read_value(name, s, position, reentrances)) # None of the above: error. else: raise ValueError('equals sign', position) # Store the value. fstruct[name] = value # If there's a close bracket, handle it at the top of the loop. if self._END_FSTRUCT_RE.match(s, position): continue # Otherwise, there should be a comma match = self._COMMA_RE.match(s, position) if match is None: raise ValueError('comma', position) position = match.end() # We never saw a close bracket. raise ValueError('close bracket', position) def _finalize(self, s, pos, reentrances, fstruct): """ Called when we see the close brace -- checks for a slash feature, and adds in default values. """ # Add the slash feature (if any) match = self._SLASH_RE.match(s, pos) if match: name = self._slash_feature v, pos = self._read_value(name, s, match.end(), reentrances) fstruct[name] = v ## Add any default features. -- handle in unficiation instead? #for feature in self._features_with_defaults: # fstruct.setdefault(feature, feature.default) # Return the value. return fstruct, pos def _read_value(self, name, s, position, reentrances): if isinstance(name, Feature): return name.read_value(s, position, reentrances, self) else: return self.read_value(s, position, reentrances) def read_value(self, s, position, reentrances): for (handler, regexp) in self.VALUE_HANDLERS: match = regexp.match(s, position) if match: handler_func = getattr(self, handler) return handler_func(s, position, reentrances, match) raise ValueError('value', position) def _error(self, s, expected, position): lines = s.split('\n') while position > len(lines[0]): position -= len(lines.pop(0))+1 # +1 for the newline. estr = ('Error parsing feature structure\n ' + lines[0] + '\n ' + ' '*position + '^ ' + 'Expected %s' % expected) raise ValueError(estr) #//////////////////////////////////////////////////////////// #{ Value Readers #//////////////////////////////////////////////////////////// #: A table indicating how feature values should be processed. Each #: entry in the table is a pair (handler, regexp). The first entry #: with a matching regexp will have its handler called. Handlers #: should have the following signature:: #: #: def handler(s, position, reentrances, match): ... 
#: #: and should return a tuple (value, position), where position is #: the string position where the value ended. (n.b.: order is #: important here!) VALUE_HANDLERS = [ ('read_fstruct_value', _START_FSTRUCT_RE), ('read_var_value', re.compile(r'\?[a-zA-Z_][a-zA-Z0-9_]*')), ('read_str_value', re.compile("[uU]?[rR]?(['\"])")), ('read_int_value', re.compile(r'-?\d+')), ('read_sym_value', re.compile(r'[a-zA-Z_][a-zA-Z0-9_]*')), ('read_app_value', re.compile(r'<(app)\((\?[a-z][a-z]*)\s*,' r'\s*(\?[a-z][a-z]*)\)>')), # ('read_logic_value', re.compile(r'<([^>]*)>')), #lazily match any character after '<' until we hit a '>' not preceded by '-' ('read_logic_value', re.compile(r'<(.*?)(?')), ('read_set_value', re.compile(r'{')), ('read_tuple_value', re.compile(r'\(')), ] def read_fstruct_value(self, s, position, reentrances, match): return self.read_partial(s, position, reentrances) def read_str_value(self, s, position, reentrances, match): return read_str(s, position) def read_int_value(self, s, position, reentrances, match): return int(match.group()), match.end() # Note: the '?' is included in the variable name. def read_var_value(self, s, position, reentrances, match): return Variable(match.group()), match.end() _SYM_CONSTS = {'None':None, 'True':True, 'False':False} def read_sym_value(self, s, position, reentrances, match): val, end = match.group(), match.end() return self._SYM_CONSTS.get(val, val), end def read_app_value(self, s, position, reentrances, match): """Mainly included for backwards compat.""" return self._logic_parser.parse('%s(%s)' % match.group(2,3)), match.end() def read_logic_value(self, s, position, reentrances, match): try: try: expr = self._logic_parser.parse(match.group(1)) except LogicalExpressionException: raise ValueError() return expr, match.end() except ValueError: raise ValueError('logic expression', match.start(1)) def read_tuple_value(self, s, position, reentrances, match): return self._read_seq_value(s, position, reentrances, match, ')', FeatureValueTuple, FeatureValueConcat) def read_set_value(self, s, position, reentrances, match): return self._read_seq_value(s, position, reentrances, match, '}', FeatureValueSet, FeatureValueUnion) def _read_seq_value(self, s, position, reentrances, match, close_paren, seq_class, plus_class): """ Helper function used by read_tuple_value and read_set_value. """ cp = re.escape(close_paren) position = match.end() # Special syntax fo empty tuples: m = re.compile(r'\s*/?\s*%s' % cp).match(s, position) if m: return seq_class(), m.end() # Read values: values = [] seen_plus = False while True: # Close paren: return value. m = re.compile(r'\s*%s' % cp).match(s, position) if m: if seen_plus: return plus_class(values), m.end() else: return seq_class(values), m.end() # Read the next value. val, position = self.read_value(s, position, reentrances) values.append(val) # Comma or looking at close paren m = re.compile(r'\s*(,|\+|(?=%s))\s*' % cp).match(s, position) if not m: raise ValueError("',' or '+' or '%s'" % cp, position) if m.group(1) == '+': seen_plus = True position = m.end() ###################################################################### #{ Demo ###################################################################### def display_unification(fs1, fs2, indent=' '): # Print the two input feature structures, side by side. 
fs1_lines = ("%s" % fs1).split('\n') fs2_lines = ("%s" % fs2).split('\n') if len(fs1_lines) > len(fs2_lines): blankline = '['+' '*(len(fs2_lines[0])-2)+']' fs2_lines += [blankline]*len(fs1_lines) else: blankline = '['+' '*(len(fs1_lines[0])-2)+']' fs1_lines += [blankline]*len(fs2_lines) for (fs1_line, fs2_line) in zip(fs1_lines, fs2_lines): print(indent + fs1_line + ' ' + fs2_line) print(indent+'-'*len(fs1_lines[0])+' '+'-'*len(fs2_lines[0])) linelen = len(fs1_lines[0])*2+3 print(indent+'| |'.center(linelen)) print(indent+'+-----UNIFY-----+'.center(linelen)) print(indent+'|'.center(linelen)) print(indent+'V'.center(linelen)) bindings = {} result = fs1.unify(fs2, bindings) if result is None: print(indent+'(FAILED)'.center(linelen)) else: print('\n'.join(indent+l.center(linelen) for l in ("%s" % result).split('\n'))) if bindings and len(bindings.bound_variables()) > 0: print(repr(bindings).center(linelen)) return result def interactive_demo(trace=False): import random, sys HELP = ''' 1-%d: Select the corresponding feature structure q: Quit t: Turn tracing on or off l: List all feature structures ?: Help ''' print(''' This demo will repeatedly present you with a list of feature structures, and ask you to choose two for unification. Whenever a new feature structure is generated, it is added to the list of choices that you can pick from. However, since this can be a large number of feature structures, the demo will only print out a random subset for you to choose between at a given time. If you want to see the complete lists, type "l". For a list of valid commands, type "?". ''') print('Press "Enter" to continue...') sys.stdin.readline() fstruct_strings = [ '[agr=[number=sing, gender=masc]]', '[agr=[gender=masc, person=3]]', '[agr=[gender=fem, person=3]]', '[subj=[agr=(1)[]], agr->(1)]', '[obj=?x]', '[subj=?x]', '[/=None]', '[/=NP]', '[cat=NP]', '[cat=VP]', '[cat=PP]', '[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]', '[gender=masc, agr=?C]', '[gender=?S, agr=[gender=?S,person=3]]' ] all_fstructs = [(i, FeatStruct(fstruct_strings[i])) for i in range(len(fstruct_strings))] def list_fstructs(fstructs): for i, fstruct in fstructs: print() lines = ("%s" % fstruct).split('\n') print('%3d: %s' % (i+1, lines[0])) for line in lines[1:]: print(' '+line) print() while True: # Pick 5 feature structures at random from the master list. 
MAX_CHOICES = 5 if len(all_fstructs) > MAX_CHOICES: fstructs = sorted(random.sample(all_fstructs, MAX_CHOICES)) else: fstructs = all_fstructs print('_'*75) print('Choose two feature structures to unify:') list_fstructs(fstructs) selected = [None,None] for (nth,i) in (('First',0), ('Second',1)): while selected[i] is None: print(('%s feature structure (1-%d,q,t,l,?): ' % (nth, len(all_fstructs))), end=' ') try: input = sys.stdin.readline().strip() if input in ('q', 'Q', 'x', 'X'): return if input in ('t', 'T'): trace = not trace print(' Trace = %s' % trace) continue if input in ('h', 'H', '?'): print(HELP % len(fstructs)); continue if input in ('l', 'L'): list_fstructs(all_fstructs); continue num = int(input)-1 selected[i] = all_fstructs[num][1] print() except: print('Bad sentence number') continue if trace: result = selected[0].unify(selected[1], trace=1) else: result = display_unification(selected[0], selected[1]) if result is not None: for i, fstruct in all_fstructs: if repr(result) == repr(fstruct): break else: all_fstructs.append((len(all_fstructs), result)) print('\nType "Enter" to continue unifying; or "q" to quit.') input = sys.stdin.readline().strip() if input in ('q', 'Q', 'x', 'X'): return def demo(trace=False): """ Just for testing """ #import random # processor breaks with values like '3rd' fstruct_strings = [ '[agr=[number=sing, gender=masc]]', '[agr=[gender=masc, person=3]]', '[agr=[gender=fem, person=3]]', '[subj=[agr=(1)[]], agr->(1)]', '[obj=?x]', '[subj=?x]', '[/=None]', '[/=NP]', '[cat=NP]', '[cat=VP]', '[cat=PP]', '[subj=[agr=[gender=?y]], obj=[agr=[gender=?y]]]', '[gender=masc, agr=?C]', '[gender=?S, agr=[gender=?S,person=3]]' ] all_fstructs = [FeatStruct(fss) for fss in fstruct_strings] #MAX_CHOICES = 5 #if len(all_fstructs) > MAX_CHOICES: #fstructs = random.sample(all_fstructs, MAX_CHOICES) #fstructs.sort() #else: #fstructs = all_fstructs for fs1 in all_fstructs: for fs2 in all_fstructs: print("\n*******************\nfs1 is:\n%s\n\nfs2 is:\n%s\n\nresult is:\n%s" % (fs1, fs2, unify(fs1, fs2))) if __name__ == '__main__': demo() __all__ = ['FeatStruct', 'FeatDict', 'FeatList', 'unify', 'subsumes', 'conflicts', 'Feature', 'SlashFeature', 'RangeFeature', 'SLASH', 'TYPE', 'FeatStructReader'] nltk-3.1/nltk/grammar.py0000644000076500000240000015075312607224144015041 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Context Free Grammars # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # Jason Narad # Peter Ljunglöf # URL: # For license information, see LICENSE.TXT # """ Basic data classes for representing context free grammars. A "grammar" specifies which trees can represent the structure of a given text. Each of these trees is called a "parse tree" for the text (or simply a "parse"). In a "context free" grammar, the set of parse trees for any piece of a text can depend only on that piece, and not on the rest of the text (i.e., the piece's context). Context free grammars are often used to find possible syntactic structures for sentences. In this context, the leaves of a parse tree are word tokens; and the node values are phrasal categories, such as ``NP`` and ``VP``. The ``CFG`` class is used to encode context free grammars. Each ``CFG`` consists of a start symbol and a set of productions. The "start symbol" specifies the root node value for parse trees. For example, the start symbol for syntactic parsing is usually ``S``. Start symbols are encoded using the ``Nonterminal`` class, which is discussed below. 
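As a quick, illustrative sketch (``CFG.fromstring`` and the classes it relies on are described below), a small grammar can be built from a string and inspected:

    >>> from nltk.grammar import CFG
    >>> grammar = CFG.fromstring('''
    ...     S -> NP VP
    ...     NP -> 'the' N
    ...     VP -> V NP
    ...     N -> 'dog' | 'cat'
    ...     V -> 'chased'
    ...     ''')
    >>> grammar.start()
    S
    >>> len(grammar.productions())
    6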
A Grammar's "productions" specify what parent-child relationships a parse tree can contain. Each production specifies that a particular node can be the parent of a particular set of children. For example, the production `` -> `` specifies that an ``S`` node can be the parent of an ``NP`` node and a ``VP`` node. Grammar productions are implemented by the ``Production`` class. Each ``Production`` consists of a left hand side and a right hand side. The "left hand side" is a ``Nonterminal`` that specifies the node type for a potential parent; and the "right hand side" is a list that specifies allowable children for that parent. This lists consists of ``Nonterminals`` and text types: each ``Nonterminal`` indicates that the corresponding child may be a ``TreeToken`` with the specified node type; and each text type indicates that the corresponding child may be a ``Token`` with the with that type. The ``Nonterminal`` class is used to distinguish node values from leaf values. This prevents the grammar from accidentally using a leaf value (such as the English word "A") as the node of a subtree. Within a ``CFG``, all node values are wrapped in the ``Nonterminal`` class. Note, however, that the trees that are specified by the grammar do *not* include these ``Nonterminal`` wrappers. Grammars can also be given a more procedural interpretation. According to this interpretation, a Grammar specifies any tree structure *tree* that can be produced by the following procedure: | Set tree to the start symbol | Repeat until tree contains no more nonterminal leaves: | Choose a production prod with whose left hand side | lhs is a nonterminal leaf of tree. | Replace the nonterminal leaf with a subtree, whose node | value is the value wrapped by the nonterminal lhs, and | whose children are the right hand side of prod. The operation of replacing the left hand side (*lhs*) of a production with the right hand side (*rhs*) in a tree (*tree*) is known as "expanding" *lhs* to *rhs* in *tree*. """ from __future__ import print_function, unicode_literals import re from nltk.util import transitive_closure, invert_graph from nltk.compat import (string_types, total_ordering, text_type, python_2_unicode_compatible, unicode_repr) from nltk.internals import raise_unorderable_types from nltk.probability import ImmutableProbabilisticMixIn from nltk.featstruct import FeatStruct, FeatDict, FeatStructReader, SLASH, TYPE ################################################################# # Nonterminal ################################################################# @total_ordering @python_2_unicode_compatible class Nonterminal(object): """ A non-terminal symbol for a context free grammar. ``Nonterminal`` is a wrapper class for node values; it is used by ``Production`` objects to distinguish node values from leaf values. The node value that is wrapped by a ``Nonterminal`` is known as its "symbol". Symbols are typically strings representing phrasal categories (such as ``"NP"`` or ``"VP"``). However, more complex symbol types are sometimes used (e.g., for lexicalized grammars). Since symbols are node values, they must be immutable and hashable. Two ``Nonterminals`` are considered equal if their symbols are equal. :see: ``CFG``, ``Production`` :type _symbol: any :ivar _symbol: The node value corresponding to this ``Nonterminal``. This value must be immutable and hashable. """ def __init__(self, symbol): """ Construct a new non-terminal from the given symbol. 
:type symbol: any :param symbol: The node value corresponding to this ``Nonterminal``. This value must be immutable and hashable. """ self._symbol = symbol self._hash = hash(symbol) def symbol(self): """ Return the node value corresponding to this ``Nonterminal``. :rtype: (any) """ return self._symbol def __eq__(self, other): """ Return True if this non-terminal is equal to ``other``. In particular, return True if ``other`` is a ``Nonterminal`` and this non-terminal's symbol is equal to ``other`` 's symbol. :rtype: bool """ return type(self) == type(other) and self._symbol == other._symbol def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, Nonterminal): raise_unorderable_types("<", self, other) return self._symbol < other._symbol def __hash__(self): return self._hash def __repr__(self): """ Return a string representation for this ``Nonterminal``. :rtype: str """ if isinstance(self._symbol, string_types): return '%s' % self._symbol else: return '%s' % unicode_repr(self._symbol) def __str__(self): """ Return a string representation for this ``Nonterminal``. :rtype: str """ if isinstance(self._symbol, string_types): return '%s' % self._symbol else: return '%s' % unicode_repr(self._symbol) def __div__(self, rhs): """ Return a new nonterminal whose symbol is ``A/B``, where ``A`` is the symbol for this nonterminal, and ``B`` is the symbol for rhs. :param rhs: The nonterminal used to form the right hand side of the new nonterminal. :type rhs: Nonterminal :rtype: Nonterminal """ return Nonterminal('%s/%s' % (self._symbol, rhs._symbol)) def nonterminals(symbols): """ Given a string containing a list of symbol names, return a list of ``Nonterminals`` constructed from those symbols. :param symbols: The symbol name string. This string can be delimited by either spaces or commas. :type symbols: str :return: A list of ``Nonterminals`` constructed from the symbol names given in ``symbols``. The ``Nonterminals`` are sorted in the same order as the symbols names. :rtype: list(Nonterminal) """ if ',' in symbols: symbol_list = symbols.split(',') else: symbol_list = symbols.split() return [Nonterminal(s.strip()) for s in symbol_list] class FeatStructNonterminal(FeatDict, Nonterminal): """A feature structure that's also a nonterminal. It acts as its own symbol, and automatically freezes itself when hashed.""" def __hash__(self): self.freeze() return FeatStruct.__hash__(self) def symbol(self): return self def is_nonterminal(item): """ :return: True if the item is a ``Nonterminal``. :rtype: bool """ return isinstance(item, Nonterminal) ################################################################# # Terminals ################################################################# def is_terminal(item): """ Return True if the item is a terminal, which currently is if it is hashable and not a ``Nonterminal``. :rtype: bool """ return hasattr(item, '__hash__') and not isinstance(item, Nonterminal) ################################################################# # Productions ################################################################# @total_ordering @python_2_unicode_compatible class Production(object): """ A grammar production. Each production maps a single symbol on the "left-hand side" to a sequence of symbols on the "right-hand side". (In the case of context-free productions, the left-hand side must be a ``Nonterminal``, and the right-hand side is a sequence of terminals and ``Nonterminals``.) 
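For example (an illustrative sketch using the classes defined in this module):

    >>> from nltk.grammar import Production, Nonterminal
    >>> Production(Nonterminal('S'), [Nonterminal('NP'), Nonterminal('VP')])
    S -> NP VP
    >>> Production(Nonterminal('N'), ['dog'])
    N -> 'dog'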
"terminals" can be any immutable hashable object that is not a ``Nonterminal``. Typically, terminals are strings representing words, such as ``"dog"`` or ``"under"``. :see: ``CFG`` :see: ``DependencyGrammar`` :see: ``Nonterminal`` :type _lhs: Nonterminal :ivar _lhs: The left-hand side of the production. :type _rhs: tuple(Nonterminal, terminal) :ivar _rhs: The right-hand side of the production. """ def __init__(self, lhs, rhs): """ Construct a new ``Production``. :param lhs: The left-hand side of the new ``Production``. :type lhs: Nonterminal :param rhs: The right-hand side of the new ``Production``. :type rhs: sequence(Nonterminal and terminal) """ if isinstance(rhs, string_types): raise TypeError('production right hand side should be a list, ' 'not a string') self._lhs = lhs self._rhs = tuple(rhs) self._hash = hash((self._lhs, self._rhs)) def lhs(self): """ Return the left-hand side of this ``Production``. :rtype: Nonterminal """ return self._lhs def rhs(self): """ Return the right-hand side of this ``Production``. :rtype: sequence(Nonterminal and terminal) """ return self._rhs def __len__(self): """ Return the length of the right-hand side. :rtype: int """ return len(self._rhs) def is_nonlexical(self): """ Return True if the right-hand side only contains ``Nonterminals`` :rtype: bool """ return all(is_nonterminal(n) for n in self._rhs) def is_lexical(self): """ Return True if the right-hand contain at least one terminal token. :rtype: bool """ return not self.is_nonlexical() def __str__(self): """ Return a verbose string representation of the ``Production``. :rtype: str """ result = '%s -> ' % unicode_repr(self._lhs) result += " ".join(unicode_repr(el) for el in self._rhs) return result def __repr__(self): """ Return a concise string representation of the ``Production``. :rtype: str """ return '%s' % self def __eq__(self, other): """ Return True if this ``Production`` is equal to ``other``. :rtype: bool """ return (type(self) == type(other) and self._lhs == other._lhs and self._rhs == other._rhs) def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, Production): raise_unorderable_types("<", self, other) return (self._lhs, self._rhs) < (other._lhs, other._rhs) def __hash__(self): """ Return a hash value for the ``Production``. :rtype: int """ return self._hash @python_2_unicode_compatible class DependencyProduction(Production): """ A dependency grammar production. Each production maps a single head word to an unordered list of one or more modifier words. """ def __str__(self): """ Return a verbose string representation of the ``DependencyProduction``. :rtype: str """ result = '\'%s\' ->' % (self._lhs,) for elt in self._rhs: result += ' \'%s\'' % (elt,) return result @python_2_unicode_compatible class ProbabilisticProduction(Production, ImmutableProbabilisticMixIn): """ A probabilistic context free grammar production. A PCFG ``ProbabilisticProduction`` is essentially just a ``Production`` that has an associated probability, which represents how likely it is that this production will be used. In particular, the probability of a ``ProbabilisticProduction`` records the likelihood that its right-hand side is the correct instantiation for any given occurrence of its left-hand side. :see: ``Production`` """ def __init__(self, lhs, rhs, **prob): """ Construct a new ``ProbabilisticProduction``. :param lhs: The left-hand side of the new ``ProbabilisticProduction``. 
:type lhs: Nonterminal :param rhs: The right-hand side of the new ``ProbabilisticProduction``. :type rhs: sequence(Nonterminal and terminal) :param prob: Probability parameters of the new ``ProbabilisticProduction``. """ ImmutableProbabilisticMixIn.__init__(self, **prob) Production.__init__(self, lhs, rhs) def __str__(self): return Production.__unicode__(self) + \ (' [1.0]' if (self.prob() == 1.0) else ' [%g]' % self.prob()) def __eq__(self, other): return (type(self) == type(other) and self._lhs == other._lhs and self._rhs == other._rhs and self.prob() == other.prob()) def __ne__(self, other): return not self == other def __hash__(self): return hash((self._lhs, self._rhs, self.prob())) ################################################################# # Grammars ################################################################# @python_2_unicode_compatible class CFG(object): """ A context-free grammar. A grammar consists of a start state and a set of productions. The set of terminals and nonterminals is implicitly specified by the productions. If you need efficient key-based access to productions, you can use a subclass to implement it. """ def __init__(self, start, productions, calculate_leftcorners=True): """ Create a new context-free grammar, from the given start state and set of ``Production``s. :param start: The start symbol :type start: Nonterminal :param productions: The list of productions that defines the grammar :type productions: list(Production) :param calculate_leftcorners: False if we don't want to calculate the leftcorner relation. In that case, some optimized chart parsers won't work. :type calculate_leftcorners: bool """ if not is_nonterminal(start): raise TypeError("start should be a Nonterminal object," " not a %s" % type(start).__name__) self._start = start self._productions = productions self._categories = set(prod.lhs() for prod in productions) self._calculate_indexes() self._calculate_grammar_forms() if calculate_leftcorners: self._calculate_leftcorners() def _calculate_indexes(self): self._lhs_index = {} self._rhs_index = {} self._empty_index = {} self._lexical_index = {} for prod in self._productions: # Left hand side. lhs = prod._lhs if lhs not in self._lhs_index: self._lhs_index[lhs] = [] self._lhs_index[lhs].append(prod) if prod._rhs: # First item in right hand side. rhs0 = prod._rhs[0] if rhs0 not in self._rhs_index: self._rhs_index[rhs0] = [] self._rhs_index[rhs0].append(prod) else: # The right hand side is empty. self._empty_index[prod.lhs()] = prod # Lexical tokens in the right hand side. for token in prod._rhs: if is_terminal(token): self._lexical_index.setdefault(token, set()).add(prod) def _calculate_leftcorners(self): # Calculate leftcorner relations, for use in optimized parsing. 
self._immediate_leftcorner_categories = dict((cat, set([cat])) for cat in self._categories) self._immediate_leftcorner_words = dict((cat, set()) for cat in self._categories) for prod in self.productions(): if len(prod) > 0: cat, left = prod.lhs(), prod.rhs()[0] if is_nonterminal(left): self._immediate_leftcorner_categories[cat].add(left) else: self._immediate_leftcorner_words[cat].add(left) lc = transitive_closure(self._immediate_leftcorner_categories, reflexive=True) self._leftcorners = lc self._leftcorner_parents = invert_graph(lc) nr_leftcorner_categories = sum(map(len, self._immediate_leftcorner_categories.values())) nr_leftcorner_words = sum(map(len, self._immediate_leftcorner_words.values())) if nr_leftcorner_words > nr_leftcorner_categories > 10000: # If the grammar is big, the leftcorner-word dictionary will be too large. # In that case it is better to calculate the relation on demand. self._leftcorner_words = None return self._leftcorner_words = {} for cat in self._leftcorners: lefts = self._leftcorners[cat] lc = self._leftcorner_words[cat] = set() for left in lefts: lc.update(self._immediate_leftcorner_words.get(left, set())) @classmethod def fromstring(cls, input, encoding=None): """ Return the ``CFG`` corresponding to the input string(s). :param input: a grammar, either in the form of a string or as a list of strings. """ start, productions = read_grammar(input, standard_nonterm_parser, encoding=encoding) return CFG(start, productions) def start(self): """ Return the start symbol of the grammar :rtype: Nonterminal """ return self._start # tricky to balance readability and efficiency here! # can't use set operations as they don't preserve ordering def productions(self, lhs=None, rhs=None, empty=False): """ Return the grammar productions, filtered by the left-hand side or the first item in the right-hand side. :param lhs: Only return productions with the given left-hand side. :param rhs: Only return productions with the given first item in the right-hand side. :param empty: Only return productions with an empty right-hand side. :return: A list of productions matching the given constraints. :rtype: list(Production) """ if rhs and empty: raise ValueError("You cannot select empty and non-empty " "productions at the same time.") # no constraints so return everything if not lhs and not rhs: if not empty: return self._productions else: return self._empty_index.values() # only lhs specified so look up its index elif lhs and not rhs: if not empty: return self._lhs_index.get(lhs, []) elif lhs in self._empty_index: return [self._empty_index[lhs]] else: return [] # only rhs specified so look up its index elif rhs and not lhs: return self._rhs_index.get(rhs, []) # intersect else: return [prod for prod in self._lhs_index.get(lhs, []) if prod in self._rhs_index.get(rhs, [])] def leftcorners(self, cat): """ Return the set of all nonterminals that the given nonterminal can start with, including itself. This is the reflexive, transitive closure of the immediate leftcorner relation: (A > B) iff (A -> B beta) :param cat: the parent of the leftcorners :type cat: Nonterminal :return: the set of all leftcorners :rtype: set(Nonterminal) """ return self._leftcorners.get(cat, set([cat])) def is_leftcorner(self, cat, left): """ True if left is a leftcorner of cat, where left can be a terminal or a nonterminal. 
:param cat: the parent of the leftcorner :type cat: Nonterminal :param left: the suggested leftcorner :type left: Terminal or Nonterminal :rtype: bool """ if is_nonterminal(left): return left in self.leftcorners(cat) elif self._leftcorner_words: return left in self._leftcorner_words.get(cat, set()) else: return any(left in self._immediate_leftcorner_words.get(parent, set()) for parent in self.leftcorners(cat)) def leftcorner_parents(self, cat): """ Return the set of all nonterminals for which the given category is a left corner. This is the inverse of the leftcorner relation. :param cat: the suggested leftcorner :type cat: Nonterminal :return: the set of all parents to the leftcorner :rtype: set(Nonterminal) """ return self._leftcorner_parents.get(cat, set([cat])) def check_coverage(self, tokens): """ Check whether the grammar rules cover the given list of tokens. If not, then raise an exception. :type tokens: list(str) """ missing = [tok for tok in tokens if not self._lexical_index.get(tok)] if missing: missing = ', '.join('%r' % (w,) for w in missing) raise ValueError("Grammar does not cover some of the " "input words: %r." % missing) def _calculate_grammar_forms(self): """ Pre-calculate of which form(s) the grammar is. """ prods = self._productions self._is_lexical = all(p.is_lexical() for p in prods) self._is_nonlexical = all(p.is_nonlexical() for p in prods if len(p) != 1) self._min_len = min(len(p) for p in prods) self._max_len = max(len(p) for p in prods) self._all_unary_are_lexical = all(p.is_lexical() for p in prods if len(p) == 1) def is_lexical(self): """ Return True if all productions are lexicalised. """ return self._is_lexical def is_nonlexical(self): """ Return True if all lexical rules are "preterminals", that is, unary rules which can be separated in a preprocessing step. This means that all productions are of the forms A -> B1 ... Bn (n>=0), or A -> "s". Note: is_lexical() and is_nonlexical() are not opposites. There are grammars which are neither, and grammars which are both. """ return self._is_nonlexical def min_len(self): """ Return the right-hand side length of the shortest grammar production. """ return self._min_len def max_len(self): """ Return the right-hand side length of the longest grammar production. """ return self._max_len def is_nonempty(self): """ Return True if there are no empty productions. """ return self._min_len > 0 def is_binarised(self): """ Return True if all productions are at most binary. Note that there can still be empty and unary productions. """ return self._max_len <= 2 def is_flexible_chomsky_normal_form(self): """ Return True if all productions are of the forms A -> B C, A -> B, or A -> "s". """ return self.is_nonempty() and self.is_nonlexical() and self.is_binarised() def is_chomsky_normal_form(self): """ Return True if the grammar is of Chomsky Normal Form, i.e. all productions are of the form A -> B C, or A -> "s". """ return (self.is_flexible_chomsky_normal_form() and self._all_unary_are_lexical) def __repr__(self): return '' % len(self._productions) def __str__(self): result = 'Grammar with %d productions' % len(self._productions) result += ' (start state = %r)' % self._start for production in self._productions: result += '\n %s' % production return result class FeatureGrammar(CFG): """ A feature-based grammar. This is equivalent to a ``CFG`` whose nonterminals are all ``FeatStructNonterminal``. A grammar consists of a start state and a set of productions. 
The set of terminals and nonterminals is implicitly specified by the productions. """ def __init__(self, start, productions): """ Create a new feature-based grammar, from the given start state and set of ``Productions``. :param start: The start symbol :type start: FeatStructNonterminal :param productions: The list of productions that defines the grammar :type productions: list(Production) """ CFG.__init__(self, start, productions) # The difference with CFG is that the productions are # indexed on the TYPE feature of the nonterminals. # This is calculated by the method _get_type_if_possible(). def _calculate_indexes(self): self._lhs_index = {} self._rhs_index = {} self._empty_index = {} self._empty_productions = [] self._lexical_index = {} for prod in self._productions: # Left hand side. lhs = self._get_type_if_possible(prod._lhs) if lhs not in self._lhs_index: self._lhs_index[lhs] = [] self._lhs_index[lhs].append(prod) if prod._rhs: # First item in right hand side. rhs0 = self._get_type_if_possible(prod._rhs[0]) if rhs0 not in self._rhs_index: self._rhs_index[rhs0] = [] self._rhs_index[rhs0].append(prod) else: # The right hand side is empty. if lhs not in self._empty_index: self._empty_index[lhs] = [] self._empty_index[lhs].append(prod) self._empty_productions.append(prod) # Lexical tokens in the right hand side. for token in prod._rhs: if is_terminal(token): self._lexical_index.setdefault(token, set()).add(prod) @classmethod def fromstring(cls, input, features=None, logic_parser=None, fstruct_reader=None, encoding=None): """ Return a feature structure based ``FeatureGrammar``. :param input: a grammar, either in the form of a string or else as a list of strings. :param features: a tuple of features (default: SLASH, TYPE) :param logic_parser: a parser for lambda-expressions, by default, ``LogicParser()`` :param fstruct_reader: a feature structure parser (only if features and logic_parser is None) """ if features is None: features = (SLASH, TYPE) if fstruct_reader is None: fstruct_reader = FeatStructReader(features, FeatStructNonterminal, logic_parser=logic_parser) elif logic_parser is not None: raise Exception('\'logic_parser\' and \'fstruct_reader\' must ' 'not both be set') start, productions = read_grammar(input, fstruct_reader.read_partial, encoding=encoding) return FeatureGrammar(start, productions) def productions(self, lhs=None, rhs=None, empty=False): """ Return the grammar productions, filtered by the left-hand side or the first item in the right-hand side. :param lhs: Only return productions with the given left-hand side. :param rhs: Only return productions with the given first item in the right-hand side. :param empty: Only return productions with an empty right-hand side. 
:rtype: list(Production) """ if rhs and empty: raise ValueError("You cannot select empty and non-empty " "productions at the same time.") # no constraints so return everything if not lhs and not rhs: if empty: return self._empty_productions else: return self._productions # only lhs specified so look up its index elif lhs and not rhs: if empty: return self._empty_index.get(self._get_type_if_possible(lhs), []) else: return self._lhs_index.get(self._get_type_if_possible(lhs), []) # only rhs specified so look up its index elif rhs and not lhs: return self._rhs_index.get(self._get_type_if_possible(rhs), []) # intersect else: return [prod for prod in self._lhs_index.get(self._get_type_if_possible(lhs), []) if prod in self._rhs_index.get(self._get_type_if_possible(rhs), [])] def leftcorners(self, cat): """ Return the set of all words that the given category can start with. Also called the "first set" in compiler construction. """ raise NotImplementedError("Not implemented yet") def leftcorner_parents(self, cat): """ Return the set of all categories for which the given category is a left corner. """ raise NotImplementedError("Not implemented yet") def _get_type_if_possible(self, item): """ Helper function which returns the ``TYPE`` feature of the ``item``, if it exists, otherwise it returns the ``item`` itself """ if isinstance(item, dict) and TYPE in item: return FeatureValueType(item[TYPE]) else: return item @total_ordering @python_2_unicode_compatible class FeatureValueType(object): """ A helper class for ``FeatureGrammars``, designed to be different from ordinary strings. This is to stop the ``FeatStruct`` ``FOO[]`` from being compare equal to the terminal "FOO". """ def __init__(self, value): self._value = value self._hash = hash(value) def __repr__(self): return '<%s>' % self._value def __eq__(self, other): return type(self) == type(other) and self._value == other._value def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, FeatureValueType): raise_unorderable_types("<", self, other) return self._value < other._value def __hash__(self): return self._hash @python_2_unicode_compatible class DependencyGrammar(object): """ A dependency grammar. A DependencyGrammar consists of a set of productions. Each production specifies a head/modifier relationship between a pair of words. """ def __init__(self, productions): """ Create a new dependency grammar, from the set of ``Productions``. :param productions: The list of productions that defines the grammar :type productions: list(Production) """ self._productions = productions @classmethod def fromstring(cls, input): productions = [] for linenum, line in enumerate(input.split('\n')): line = line.strip() if line.startswith('#') or line=='': continue try: productions += _read_dependency_production(line) except ValueError: raise ValueError('Unable to parse line %s: %s' % (linenum, line)) if len(productions) == 0: raise ValueError('No productions found!') return DependencyGrammar(productions) def contains(self, head, mod): """ :param head: A head word. :type head: str :param mod: A mod word, to test as a modifier of 'head'. :type mod: str :return: true if this ``DependencyGrammar`` contains a ``DependencyProduction`` mapping 'head' to 'mod'. 
:rtype: bool """ for production in self._productions: for possibleMod in production._rhs: if(production._lhs == head and possibleMod == mod): return True return False def __contains__(self, head, mod): """ Return True if this ``DependencyGrammar`` contains a ``DependencyProduction`` mapping 'head' to 'mod'. :param head: A head word. :type head: str :param mod: A mod word, to test as a modifier of 'head'. :type mod: str :rtype: bool """ for production in self._productions: for possibleMod in production._rhs: if(production._lhs == head and possibleMod == mod): return True return False # # should be rewritten, the set comp won't work in all comparisons # def contains_exactly(self, head, modlist): # for production in self._productions: # if(len(production._rhs) == len(modlist)): # if(production._lhs == head): # set1 = Set(production._rhs) # set2 = Set(modlist) # if(set1 == set2): # return True # return False def __str__(self): """ Return a verbose string representation of the ``DependencyGrammar`` :rtype: str """ str = 'Dependency grammar with %d productions' % len(self._productions) for production in self._productions: str += '\n %s' % production return str def __repr__(self): """ Return a concise string representation of the ``DependencyGrammar`` """ return 'Dependency grammar with %d productions' % len(self._productions) @python_2_unicode_compatible class ProbabilisticDependencyGrammar(object): """ """ def __init__(self, productions, events, tags): self._productions = productions self._events = events self._tags = tags def contains(self, head, mod): """ Return True if this ``DependencyGrammar`` contains a ``DependencyProduction`` mapping 'head' to 'mod'. :param head: A head word. :type head: str :param mod: A mod word, to test as a modifier of 'head'. :type mod: str :rtype: bool """ for production in self._productions: for possibleMod in production._rhs: if(production._lhs == head and possibleMod == mod): return True return False def __str__(self): """ Return a verbose string representation of the ``ProbabilisticDependencyGrammar`` :rtype: str """ str = 'Statistical dependency grammar with %d productions' % len(self._productions) for production in self._productions: str += '\n %s' % production str += '\nEvents:' for event in self._events: str += '\n %d:%s' % (self._events[event], event) str += '\nTags:' for tag_word in self._tags: str += '\n %s:\t(%s)' % (tag_word, self._tags[tag_word]) return str def __repr__(self): """ Return a concise string representation of the ``ProbabilisticDependencyGrammar`` """ return 'Statistical Dependency grammar with %d productions' % len(self._productions) class PCFG(CFG): """ A probabilistic context-free grammar. A PCFG consists of a start state and a set of productions with probabilities. The set of terminals and nonterminals is implicitly specified by the productions. PCFG productions use the ``ProbabilisticProduction`` class. ``PCFGs`` impose the constraint that the set of productions with any given left-hand-side must have probabilities that sum to 1 (allowing for a small margin of error). If you need efficient key-based access to productions, you can use a subclass to implement it. :type EPSILON: float :cvar EPSILON: The acceptable margin of error for checking that productions with a given left-hand side have probabilities that sum to 1. """ EPSILON = 0.01 def __init__(self, start, productions, calculate_leftcorners=True): """ Create a new context-free grammar, from the given start state and set of ``ProbabilisticProductions``. 
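For example, such a grammar is usually built with ``PCFG.fromstring`` (defined below); an illustrative sketch, where each left-hand side's probabilities sum to 1:

    >>> from nltk.grammar import PCFG
    >>> toy_pcfg = PCFG.fromstring('''
    ...     S -> NP VP [1.0]
    ...     NP -> 'cats' [0.7] | 'dogs' [0.3]
    ...     VP -> 'sleep' [1.0]
    ...     ''')
    >>> len(toy_pcfg.productions())
    4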
:param start: The start symbol :type start: Nonterminal :param productions: The list of productions that defines the grammar :type productions: list(Production) :raise ValueError: if the set of productions with any left-hand-side do not have probabilities that sum to a value within EPSILON of 1. :param calculate_leftcorners: False if we don't want to calculate the leftcorner relation. In that case, some optimized chart parsers won't work. :type calculate_leftcorners: bool """ CFG.__init__(self, start, productions, calculate_leftcorners) # Make sure that the probabilities sum to one. probs = {} for production in productions: probs[production.lhs()] = (probs.get(production.lhs(), 0) + production.prob()) for (lhs, p) in probs.items(): if not ((1-PCFG.EPSILON) < p < (1+PCFG.EPSILON)): raise ValueError("Productions for %r do not sum to 1" % lhs) @classmethod def fromstring(cls, input, encoding=None): """ Return a probabilistic ``PCFG`` corresponding to the input string(s). :param input: a grammar, either in the form of a string or else as a list of strings. """ start, productions = read_grammar(input, standard_nonterm_parser, probabilistic=True, encoding=encoding) return PCFG(start, productions) ################################################################# # Inducing Grammars ################################################################# # Contributed by Nathan Bodenstab def induce_pcfg(start, productions): """ Induce a PCFG grammar from a list of productions. The probability of a production A -> B C in a PCFG is: | count(A -> B C) | P(B, C | A) = --------------- where \* is any right hand side | count(A -> \*) :param start: The start symbol :type start: Nonterminal :param productions: The list of productions that defines the grammar :type productions: list(Production) """ # Production count: the number of times a given production occurs pcount = {} # LHS-count: counts the number of times a given lhs occurs lcount = {} for prod in productions: lcount[prod.lhs()] = lcount.get(prod.lhs(), 0) + 1 pcount[prod] = pcount.get(prod, 0) + 1 prods = [ProbabilisticProduction(p.lhs(), p.rhs(), prob=float(pcount[p]) / lcount[p.lhs()]) for p in pcount] return PCFG(start, prods) ################################################################# # Helper functions for reading productions ################################################################# def _read_cfg_production(input): """ Return a list of context-free ``Productions``. """ return _read_production(input, standard_nonterm_parser) def _read_pcfg_production(input): """ Return a list of PCFG ``ProbabilisticProductions``. """ return _read_production(input, standard_nonterm_parser, probabilistic=True) def _read_fcfg_production(input, fstruct_reader): """ Return a list of feature-based ``Productions``. """ return _read_production(input, fstruct_reader) # Parsing generic grammars _ARROW_RE = re.compile(r'\s* -> \s*', re.VERBOSE) _PROBABILITY_RE = re.compile(r'( \[ [\d\.]+ \] ) \s*', re.VERBOSE) _TERMINAL_RE = re.compile(r'( "[^"]+" | \'[^\']+\' ) \s*', re.VERBOSE) _DISJUNCTION_RE = re.compile(r'\| \s*', re.VERBOSE) def _read_production(line, nonterm_parser, probabilistic=False): """ Parse a grammar rule, given as a string, and return a list of productions. """ pos = 0 # Parse the left-hand side. lhs, pos = nonterm_parser(line, pos) # Skip over the arrow. m = _ARROW_RE.match(line, pos) if not m: raise ValueError('Expected an arrow') pos = m.end() # Parse the right hand side. probabilities = [0.0] rhsides = [[]] while pos < len(line): # Probability. 
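# A bracketed probability such as [0.5] is only consumed when reading a
# probabilistic grammar; otherwise the token falls through to the
# terminal / disjunction / nonterminal branches below.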
m = _PROBABILITY_RE.match(line, pos) if probabilistic and m: pos = m.end() probabilities[-1] = float(m.group(1)[1:-1]) if probabilities[-1] > 1.0: raise ValueError('Production probability %f, ' 'should not be greater than 1.0' % (probabilities[-1],)) # String -- add terminal. elif line[pos] in "\'\"": m = _TERMINAL_RE.match(line, pos) if not m: raise ValueError('Unterminated string') rhsides[-1].append(m.group(1)[1:-1]) pos = m.end() # Vertical bar -- start new rhside. elif line[pos] == '|': m = _DISJUNCTION_RE.match(line, pos) probabilities.append(0.0) rhsides.append([]) pos = m.end() # Anything else -- nonterminal. else: nonterm, pos = nonterm_parser(line, pos) rhsides[-1].append(nonterm) if probabilistic: return [ProbabilisticProduction(lhs, rhs, prob=probability) for (rhs, probability) in zip(rhsides, probabilities)] else: return [Production(lhs, rhs) for rhs in rhsides] ################################################################# # Reading Phrase Structure Grammars ################################################################# def read_grammar(input, nonterm_parser, probabilistic=False, encoding=None): """ Return a pair consisting of a starting category and a list of ``Productions``. :param input: a grammar, either in the form of a string or else as a list of strings. :param nonterm_parser: a function for parsing nonterminals. It should take a ``(string, position)`` as argument and return a ``(nonterminal, position)`` as result. :param probabilistic: are the grammar rules probabilistic? :type probabilistic: bool :param encoding: the encoding of the grammar, if it is a binary string :type encoding: str """ if encoding is not None: input = input.decode(encoding) if isinstance(input, string_types): lines = input.split('\n') else: lines = input start = None productions = [] continue_line = '' for linenum, line in enumerate(lines): line = continue_line + line.strip() if line.startswith('#') or line=='': continue if line.endswith('\\'): continue_line = line[:-1].rstrip()+' ' continue continue_line = '' try: if line[0] == '%': directive, args = line[1:].split(None, 1) if directive == 'start': start, pos = nonterm_parser(args, 0) if pos != len(args): raise ValueError('Bad argument to start directive') else: raise ValueError('Bad directive') else: # expand out the disjunctions on the RHS productions += _read_production(line, nonterm_parser, probabilistic) except ValueError as e: raise ValueError('Unable to parse line %s: %s\n%s' % (linenum+1, line, e)) if not productions: raise ValueError('No productions found!') if not start: start = productions[0].lhs() return (start, productions) _STANDARD_NONTERM_RE = re.compile('( [\w/][\w/^<>-]* ) \s*', re.VERBOSE) def standard_nonterm_parser(string, pos): m = _STANDARD_NONTERM_RE.match(string, pos) if not m: raise ValueError('Expected a nonterminal, found: ' + string[pos:]) return (Nonterminal(m.group(1)), m.end()) ################################################################# # Reading Dependency Grammars ################################################################# _READ_DG_RE = re.compile(r'''^\s* # leading whitespace ('[^']+')\s* # single-quoted lhs (?:[-=]+>)\s* # arrow (?:( # rhs: "[^"]+" # doubled-quoted terminal | '[^']+' # single-quoted terminal | \| # disjunction ) \s*) # trailing space *$''', # zero or more copies re.VERBOSE) _SPLIT_DG_RE = re.compile(r'''('[^']'|[-=]+>|"[^"]+"|'[^']+'|\|)''') def _read_dependency_production(s): if not _READ_DG_RE.match(s): raise ValueError('Bad production string') pieces = 
_SPLIT_DG_RE.split(s) pieces = [p for i,p in enumerate(pieces) if i%2==1] lhside = pieces[0].strip('\'\"') rhsides = [[]] for piece in pieces[2:]: if piece == '|': rhsides.append([]) else: rhsides[-1].append(piece.strip('\'\"')) return [DependencyProduction(lhside, rhside) for rhside in rhsides] ################################################################# # Demonstration ################################################################# def cfg_demo(): """ A demonstration showing how ``CFGs`` can be created and used. """ from nltk import nonterminals, Production, CFG # Create some nonterminals S, NP, VP, PP = nonterminals('S, NP, VP, PP') N, V, P, Det = nonterminals('N, V, P, Det') VP_slash_NP = VP/NP print('Some nonterminals:', [S, NP, VP, PP, N, V, P, Det, VP/NP]) print(' S.symbol() =>', repr(S.symbol())) print() print(Production(S, [NP])) # Create some Grammar Productions grammar = CFG.fromstring(""" S -> NP VP PP -> P NP NP -> Det N | NP PP VP -> V NP | VP PP Det -> 'a' | 'the' N -> 'dog' | 'cat' V -> 'chased' | 'sat' P -> 'on' | 'in' """) print('A Grammar:', repr(grammar)) print(' grammar.start() =>', repr(grammar.start())) print(' grammar.productions() =>', end=' ') # Use string.replace(...) is to line-wrap the output. print(repr(grammar.productions()).replace(',', ',\n'+' '*25)) print() toy_pcfg1 = PCFG.fromstring(""" S -> NP VP [1.0] NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] Det -> 'the' [0.8] | 'my' [0.2] N -> 'man' [0.5] | 'telescope' [0.5] VP -> VP PP [0.1] | V NP [0.7] | V [0.2] V -> 'ate' [0.35] | 'saw' [0.65] PP -> P NP [1.0] P -> 'with' [0.61] | 'under' [0.39] """) toy_pcfg2 = PCFG.fromstring(""" S -> NP VP [1.0] VP -> V NP [.59] VP -> V [.40] VP -> VP PP [.01] NP -> Det N [.41] NP -> Name [.28] NP -> NP PP [.31] PP -> P NP [1.0] V -> 'saw' [.21] V -> 'ate' [.51] V -> 'ran' [.28] N -> 'boy' [.11] N -> 'cookie' [.12] N -> 'table' [.13] N -> 'telescope' [.14] N -> 'hill' [.5] Name -> 'Jack' [.52] Name -> 'Bob' [.48] P -> 'with' [.61] P -> 'under' [.39] Det -> 'the' [.41] Det -> 'a' [.31] Det -> 'my' [.28] """) def pcfg_demo(): """ A demonstration showing how a ``PCFG`` can be created and used. """ from nltk.corpus import treebank from nltk import treetransforms from nltk import induce_pcfg from nltk.parse import pchart pcfg_prods = toy_pcfg1.productions() pcfg_prod = pcfg_prods[2] print('A PCFG production:', repr(pcfg_prod)) print(' pcfg_prod.lhs() =>', repr(pcfg_prod.lhs())) print(' pcfg_prod.rhs() =>', repr(pcfg_prod.rhs())) print(' pcfg_prod.prob() =>', repr(pcfg_prod.prob())) print() grammar = toy_pcfg2 print('A PCFG grammar:', repr(grammar)) print(' grammar.start() =>', repr(grammar.start())) print(' grammar.productions() =>', end=' ') # Use .replace(...) is to line-wrap the output. 
print(repr(grammar.productions()).replace(',', ',\n'+' '*26)) print() # extract productions from three trees and induce the PCFG print("Induce PCFG grammar from treebank data:") productions = [] item = treebank._fileids[0] for tree in treebank.parsed_sents(item)[:3]: # perform optional tree transformations, e.g.: tree.collapse_unary(collapsePOS = False) tree.chomsky_normal_form(horzMarkov = 2) productions += tree.productions() S = Nonterminal('S') grammar = induce_pcfg(S, productions) print(grammar) print() print("Parse sentence using induced grammar:") parser = pchart.InsideChartParser(grammar) parser.trace(3) # doesn't work as tokens are different: #sent = treebank.tokenized('wsj_0001.mrg')[0] sent = treebank.parsed_sents(item)[0].leaves() print(sent) for parse in parser.parse(sent): print(parse) def fcfg_demo(): import nltk.data g = nltk.data.load('grammars/book_grammars/feat0.fcfg') print(g) print() def dg_demo(): """ A demonstration showing the creation and inspection of a ``DependencyGrammar``. """ grammar = DependencyGrammar.fromstring(""" 'scratch' -> 'cats' | 'walls' 'walls' -> 'the' 'cats' -> 'the' """) print(grammar) def sdg_demo(): """ A demonstration of how to read a string representation of a CoNLL format dependency tree. """ from nltk.parse import DependencyGraph dg = DependencyGraph(""" 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ 2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _ 3 met met Prep Prep voor 8 mod _ _ 4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _ 5 moeder moeder N N soort|ev|neut 3 obj1 _ _ 6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _ 7 gaan ga V V hulp|inf 6 vc _ _ 8 winkelen winkel V V intrans|inf 11 cnj _ _ 9 , , Punc Punc komma 8 punct _ _ 10 zwemmen zwem V V intrans|inf 11 cnj _ _ 11 of of Conj Conj neven 7 vc _ _ 12 terrassen terras N N soort|mv|neut 11 cnj _ _ 13 . . Punc Punc punt 12 punct _ _ """) tree = dg.tree() print(tree.pprint()) def demo(): cfg_demo() pcfg_demo() fcfg_demo() dg_demo() sdg_demo() if __name__ == '__main__': demo() __all__ = ['Nonterminal', 'nonterminals', 'CFG', 'Production', 'PCFG', 'ProbabilisticProduction', 'DependencyGrammar', 'DependencyProduction', 'ProbabilisticDependencyGrammar', 'induce_pcfg', 'read_grammar'] nltk-3.1/nltk/help.py0000644000076500000240000000316112607224144014331 0ustar sbstaff00000000000000# Natural Language Toolkit (NLTK) Help # # Copyright (C) 2001-2015 NLTK Project # Authors: Steven Bird # URL: # For license information, see LICENSE.TXT """ Provide structured access to documentation. 
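For example, to look up a Penn Treebank tag (requires the ``help/tagsets``
resource from the NLTK data package; output abridged):

    >>> import nltk
    >>> nltk.help.upenn_tagset('NN')    # doctest: +SKIP
    NN: noun, common, singular or mass
        ...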
""" from __future__ import print_function import re from textwrap import wrap from nltk.data import load def brown_tagset(tagpattern=None): _format_tagset("brown_tagset", tagpattern) def claws5_tagset(tagpattern=None): _format_tagset("claws5_tagset", tagpattern) def upenn_tagset(tagpattern=None): _format_tagset("upenn_tagset", tagpattern) ##################################################################### # UTILITIES ##################################################################### def _print_entries(tags, tagdict): for tag in tags: entry = tagdict[tag] defn = [tag + ": " + entry[0]] examples = wrap(entry[1], width=75, initial_indent=' ', subsequent_indent=' ') print("\n".join(defn + examples)) def _format_tagset(tagset, tagpattern=None): tagdict = load("help/tagsets/" + tagset + ".pickle") if not tagpattern: _print_entries(sorted(tagdict), tagdict) elif tagpattern in tagdict: _print_entries([tagpattern], tagdict) else: tagpattern = re.compile(tagpattern) tags = [tag for tag in sorted(tagdict) if tagpattern.match(tag)] if tags: _print_entries(tags, tagdict) else: print("No matching tags found.") if __name__ == '__main__': brown_tagset(r'NN.*') upenn_tagset(r'.*\$') claws5_tagset('UNDEFINED') brown_tagset(r'NN') nltk-3.1/nltk/inference/0000755000076500000240000000000012610001541014750 5ustar sbstaff00000000000000nltk-3.1/nltk/inference/__init__.py0000644000076500000240000000142512607224144017077 0ustar sbstaff00000000000000# Natural Language Toolkit: Inference # # Copyright (C) 2001-2015 NLTK Project # Author: Dan Garrette # Ewan Klein # # URL: # For license information, see LICENSE.TXT """ Classes and interfaces for theorem proving and model building. """ from nltk.inference.api import ParallelProverBuilder, ParallelProverBuilderCommand from nltk.inference.mace import Mace, MaceCommand from nltk.inference.prover9 import Prover9, Prover9Command from nltk.inference.resolution import ResolutionProver, ResolutionProverCommand from nltk.inference.tableau import TableauProver, TableauProverCommand from nltk.inference.discourse import (ReadingCommand, CfgReadingCommand, DrtGlueReadingCommand, DiscourseTester) nltk-3.1/nltk/inference/api.py0000644000076500000240000004563512574600335016127 0ustar sbstaff00000000000000# Natural Language Toolkit: Classifier Interface # # Author: Ewan Klein # Dan Garrette # # URL: # For license information, see LICENSE.TXT """ Interfaces and base classes for theorem provers and model builders. ``Prover`` is a standard interface for a theorem prover which tries to prove a goal from a list of assumptions. ``ModelBuilder`` is a standard interface for a model builder. Given just a set of assumptions. the model builder tries to build a model for the assumptions. Given a set of assumptions and a goal *G*, the model builder tries to find a counter-model, in the sense of a model that will satisfy the assumptions plus the negation of *G*. """ from __future__ import print_function import threading import time class Prover(object): """ Interface for trying to prove a goal from assumptions. Both the goal and the assumptions are constrained to be formulas of ``logic.Expression``. """ def prove(self, goal=None, assumptions=None, verbose=False): """ :return: Whether the proof was successful or not. 
:rtype: bool """ return self._prove(goal, assumptions, verbose)[0] def _prove(self, goal=None, assumptions=None, verbose=False): """ :return: Whether the proof was successful or not, along with the proof :rtype: tuple: (bool, str) """ raise NotImplementedError() class ModelBuilder(object): """ Interface for trying to build a model of set of formulas. Open formulas are assumed to be universally quantified. Both the goal and the assumptions are constrained to be formulas of ``logic.Expression``. """ def build_model(self, goal=None, assumptions=None, verbose=False): """ Perform the actual model building. :return: Whether a model was generated :rtype: bool """ return self._build_model(goal, assumptions, verbose)[0] def _build_model(self, goal=None, assumptions=None, verbose=False): """ Perform the actual model building. :return: Whether a model was generated, and the model itself :rtype: tuple(bool, sem.Valuation) """ raise NotImplementedError() class TheoremToolCommand(object): """ This class holds a goal and a list of assumptions to be used in proving or model building. """ def add_assumptions(self, new_assumptions): """ Add new assumptions to the assumption list. :param new_assumptions: new assumptions :type new_assumptions: list(sem.Expression) """ raise NotImplementedError() def retract_assumptions(self, retracted, debug=False): """ Retract assumptions from the assumption list. :param debug: If True, give warning when ``retracted`` is not present on assumptions list. :type debug: bool :param retracted: assumptions to be retracted :type retracted: list(sem.Expression) """ raise NotImplementedError() def assumptions(self): """ List the current assumptions. :return: list of ``Expression`` """ raise NotImplementedError() def goal(self): """ Return the goal :return: ``Expression`` """ raise NotImplementedError() def print_assumptions(self): """ Print the list of the current assumptions. """ raise NotImplementedError() class ProverCommand(TheoremToolCommand): """ This class holds a ``Prover``, a goal, and a list of assumptions. When prove() is called, the ``Prover`` is executed with the goal and assumptions. """ def prove(self, verbose=False): """ Perform the actual proof. """ raise NotImplementedError() def proof(self, simplify=True): """ Return the proof string :param simplify: bool simplify the proof? :return: str """ raise NotImplementedError() def get_prover(self): """ Return the prover object :return: ``Prover`` """ raise NotImplementedError() class ModelBuilderCommand(TheoremToolCommand): """ This class holds a ``ModelBuilder``, a goal, and a list of assumptions. When build_model() is called, the ``ModelBuilder`` is executed with the goal and assumptions. """ def build_model(self, verbose=False): """ Perform the actual model building. :return: A model if one is generated; None otherwise. :rtype: sem.Valuation """ raise NotImplementedError() def model(self, format=None): """ Return a string representation of the model :param simplify: bool simplify the proof? :return: str """ raise NotImplementedError() def get_model_builder(self): """ Return the model builder object :return: ``ModelBuilder`` """ raise NotImplementedError() class BaseTheoremToolCommand(TheoremToolCommand): """ This class holds a goal and a list of assumptions to be used in proving or model building. """ def __init__(self, goal=None, assumptions=None): """ :param goal: Input expression to prove :type goal: sem.Expression :param assumptions: Input expressions to use as assumptions in the proof. 
:type assumptions: list(sem.Expression) """ self._goal = goal if not assumptions: self._assumptions = [] else: self._assumptions = list(assumptions) self._result = None """A holder for the result, to prevent unnecessary re-proving""" def add_assumptions(self, new_assumptions): """ Add new assumptions to the assumption list. :param new_assumptions: new assumptions :type new_assumptions: list(sem.Expression) """ self._assumptions.extend(new_assumptions) self._result = None def retract_assumptions(self, retracted, debug=False): """ Retract assumptions from the assumption list. :param debug: If True, give warning when ``retracted`` is not present on assumptions list. :type debug: bool :param retracted: assumptions to be retracted :type retracted: list(sem.Expression) """ retracted = set(retracted) result_list = list(filter(lambda a: a not in retracted, self._assumptions)) if debug and result_list == self._assumptions: print(Warning("Assumptions list has not been changed:")) self.print_assumptions() self._assumptions = result_list self._result = None def assumptions(self): """ List the current assumptions. :return: list of ``Expression`` """ return self._assumptions def goal(self): """ Return the goal :return: ``Expression`` """ return self._goal def print_assumptions(self): """ Print the list of the current assumptions. """ for a in self.assumptions(): print(a) class BaseProverCommand(BaseTheoremToolCommand, ProverCommand): """ This class holds a ``Prover``, a goal, and a list of assumptions. When prove() is called, the ``Prover`` is executed with the goal and assumptions. """ def __init__(self, prover, goal=None, assumptions=None): """ :param prover: The theorem tool to execute with the assumptions :type prover: Prover :see: ``BaseTheoremToolCommand`` """ self._prover = prover """The theorem tool to execute with the assumptions""" BaseTheoremToolCommand.__init__(self, goal, assumptions) self._proof = None def prove(self, verbose=False): """ Perform the actual proof. Store the result to prevent unnecessary re-proving. """ if self._result is None: self._result, self._proof = self._prover._prove(self.goal(), self.assumptions(), verbose) return self._result def proof(self, simplify=True): """ Return the proof string :param simplify: bool simplify the proof? :return: str """ if self._result is None: raise LookupError("You have to call prove() first to get a proof!") else: return self.decorate_proof(self._proof, simplify) def decorate_proof(self, proof_string, simplify=True): """ Modify and return the proof string :param proof_string: str the proof to decorate :param simplify: bool simplify the proof? :return: str """ return proof_string def get_prover(self): return self._prover class BaseModelBuilderCommand(BaseTheoremToolCommand, ModelBuilderCommand): """ This class holds a ``ModelBuilder``, a goal, and a list of assumptions. When build_model() is called, the ``ModelBuilder`` is executed with the goal and assumptions. """ def __init__(self, modelbuilder, goal=None, assumptions=None): """ :param modelbuilder: The theorem tool to execute with the assumptions :type modelbuilder: ModelBuilder :see: ``BaseTheoremToolCommand`` """ self._modelbuilder = modelbuilder """The theorem tool to execute with the assumptions""" BaseTheoremToolCommand.__init__(self, goal, assumptions) self._model = None def build_model(self, verbose=False): """ Attempt to build a model. Store the result to prevent unnecessary re-building. 
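For example, with ``MaceCommand`` (the concrete command in
``nltk.inference.mace``). The external Mace4 program is required, so the
calls are shown skipped; the second call simply returns the cached result:

    >>> from nltk.sem.logic import Expression
    >>> from nltk.inference.mace import MaceCommand
    >>> mc = MaceCommand(None, [Expression.fromstring('man(socrates)')])
    >>> mc.build_model()    # doctest: +SKIP
    True
    >>> mc.build_model()    # doctest: +SKIP
    True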
""" if self._result is None: self._result, self._model = \ self._modelbuilder._build_model(self.goal(), self.assumptions(), verbose) return self._result def model(self, format=None): """ Return a string representation of the model :param simplify: bool simplify the proof? :return: str """ if self._result is None: raise LookupError('You have to call build_model() first to ' 'get a model!') else: return self._decorate_model(self._model, format) def _decorate_model(self, valuation_str, format=None): """ :param valuation_str: str with the model builder's output :param format: str indicating the format for displaying :return: str """ return valuation_str def get_model_builder(self): return self._modelbuilder class TheoremToolCommandDecorator(TheoremToolCommand): """ A base decorator for the ``ProverCommandDecorator`` and ``ModelBuilderCommandDecorator`` classes from which decorators can extend. """ def __init__(self, command): """ :param command: ``TheoremToolCommand`` to decorate """ self._command = command #The decorator has its own versions of 'result' different from the #underlying command self._result = None def assumptions(self): return self._command.assumptions() def goal(self): return self._command.goal() def add_assumptions(self, new_assumptions): self._command.add_assumptions(new_assumptions) self._result = None def retract_assumptions(self, retracted, debug=False): self._command.retract_assumptions(retracted, debug) self._result = None def print_assumptions(self): self._command.print_assumptions() class ProverCommandDecorator(TheoremToolCommandDecorator, ProverCommand): """ A base decorator for the ``ProverCommand`` class from which other prover command decorators can extend. """ def __init__(self, proverCommand): """ :param proverCommand: ``ProverCommand`` to decorate """ TheoremToolCommandDecorator.__init__(self, proverCommand) #The decorator has its own versions of 'result' and 'proof' #because they may be different from the underlying command self._proof = None def prove(self, verbose=False): if self._result is None: prover = self.get_prover() self._result, self._proof = prover._prove(self.goal(), self.assumptions(), verbose) return self._result def proof(self, simplify=True): """ Return the proof string :param simplify: bool simplify the proof? :return: str """ if self._result is None: raise LookupError("You have to call prove() first to get a proof!") else: return self.decorate_proof(self._proof, simplify) def decorate_proof(self, proof_string, simplify=True): """ Modify and return the proof string :param proof_string: str the proof to decorate :param simplify: bool simplify the proof? :return: str """ return self._command.decorate_proof(proof_string, simplify) def get_prover(self): return self._command.get_prover() class ModelBuilderCommandDecorator(TheoremToolCommandDecorator, ModelBuilderCommand): """ A base decorator for the ``ModelBuilderCommand`` class from which other prover command decorators can extend. """ def __init__(self, modelBuilderCommand): """ :param modelBuilderCommand: ``ModelBuilderCommand`` to decorate """ TheoremToolCommandDecorator.__init__(self, modelBuilderCommand) #The decorator has its own versions of 'result' and 'valuation' #because they may be different from the underlying command self._model = None def build_model(self, verbose=False): """ Attempt to build a model. Store the result to prevent unnecessary re-building. 
""" if self._result is None: modelbuilder = self.get_model_builder() self._result, self._model = \ modelbuilder._build_model(self.goal(), self.assumptions(), verbose) return self._result def model(self, format=None): """ Return a string representation of the model :param simplify: bool simplify the proof? :return: str """ if self._result is None: raise LookupError('You have to call build_model() first to ' 'get a model!') else: return self._decorate_model(self._model, format) def _decorate_model(self, valuation_str, format=None): """ Modify and return the proof string :param valuation_str: str with the model builder's output :param format: str indicating the format for displaying :return: str """ return self._command._decorate_model(valuation_str, format) def get_model_builder(self): return self._command.get_prover() class ParallelProverBuilder(Prover, ModelBuilder): """ This class stores both a prover and a model builder and when either prove() or build_model() is called, then both theorem tools are run in parallel. Whichever finishes first, the prover or the model builder, is the result that will be used. """ def __init__(self, prover, modelbuilder): self._prover = prover self._modelbuilder = modelbuilder def _prove(self, goal=None, assumptions=None, verbose=False): return self._run(goal, assumptions, verbose), '' def _build_model(self, goal=None, assumptions=None, verbose=False): return not self._run(goal, assumptions, verbose), '' def _run(self, goal, assumptions, verbose): # Set up two thread, Prover and ModelBuilder to run in parallel tp_thread = TheoremToolThread(lambda: self._prover.prove(goal, assumptions, verbose), verbose, 'TP') mb_thread = TheoremToolThread(lambda: self._modelbuilder.build_model(goal, assumptions, verbose), verbose, 'MB') tp_thread.start() mb_thread.start() while tp_thread.isAlive() and mb_thread.isAlive(): # wait until either the prover or the model builder is done pass if tp_thread.result is not None: return tp_thread.result elif mb_thread.result is not None: return not mb_thread.result else: return None class ParallelProverBuilderCommand(BaseProverCommand, BaseModelBuilderCommand): """ This command stores both a prover and a model builder and when either prove() or build_model() is called, then both theorem tools are run in parallel. Whichever finishes first, the prover or the model builder, is the result that will be used. Because the theorem prover result is the opposite of the model builder result, we will treat self._result as meaning "proof found/no model found". 
""" def __init__(self, prover, modelbuilder, goal=None, assumptions=None): BaseProverCommand.__init__(self, prover, goal, assumptions) BaseModelBuilderCommand.__init__(self, modelbuilder, goal, assumptions) def prove(self, verbose=False): return self._run(verbose) def build_model(self, verbose=False): return not self._run(verbose) def _run(self, verbose): # Set up two thread, Prover and ModelBuilder to run in parallel tp_thread = TheoremToolThread(lambda: BaseProverCommand.prove(self, verbose), verbose, 'TP') mb_thread = TheoremToolThread(lambda: BaseModelBuilderCommand.build_model(self, verbose), verbose, 'MB') tp_thread.start() mb_thread.start() while tp_thread.isAlive() and mb_thread.isAlive(): # wait until either the prover or the model builder is done pass if tp_thread.result is not None: self._result = tp_thread.result elif mb_thread.result is not None: self._result = not mb_thread.result return self._result class TheoremToolThread(threading.Thread): def __init__(self, command, verbose, name=None): threading.Thread.__init__(self) self._command = command self._result = None self._verbose = verbose self._name = name def run(self): try: self._result = self._command() if self._verbose: print('Thread %s finished with result %s at %s' % \ (self._name, self._result, time.localtime(time.time()))) except Exception as e: print(e) print('Thread %s completed abnormally' % (self._name)) @property def result(self): return self._result nltk-3.1/nltk/inference/discourse.py0000644000076500000240000005256112574600335017352 0ustar sbstaff00000000000000# Natural Language Toolkit: Discourse Processing # # Author: Ewan Klein # Dan Garrette # # URL: # For license information, see LICENSE.TXT """ Module for incrementally developing simple discourses, and checking for semantic ambiguity, consistency and informativeness. Many of the ideas are based on the CURT family of programs of Blackburn and Bos (see http://homepages.inf.ed.ac.uk/jbos/comsem/book1.html). Consistency checking is carried out by using the ``mace`` module to call the Mace4 model builder. Informativeness checking is carried out with a call to ``Prover.prove()`` from the ``inference`` module. ``DiscourseTester`` is a constructor for discourses. The basic data structure is a list of sentences, stored as ``self._sentences``. Each sentence in the list is assigned a "sentence ID" (``sid``) of the form ``s``\ *i*. For example:: s0: A boxer walks s1: Every boxer chases a girl Each sentence can be ambiguous between a number of readings, each of which receives a "reading ID" (``rid``) of the form ``s``\ *i* -``r``\ *j*. For example:: s0 readings: s0-r1: some x.(boxer(x) & walk(x)) s0-r0: some x.(boxerdog(x) & walk(x)) A "thread" is a list of readings, represented as a list of ``rid``\ s. Each thread receives a "thread ID" (``tid``) of the form ``d``\ *i*. For example:: d0: ['s0-r0', 's1-r0'] The set of all threads for a discourse is the Cartesian product of all the readings of the sequences of sentences. (This is not intended to scale beyond very short discourses!) The method ``readings(filter=True)`` will only show those threads which are consistent (taking into account any background assumptions). 
""" from __future__ import print_function import os from operator import and_, add from functools import reduce from nltk.data import show_cfg from nltk.tag import RegexpTagger from nltk.parse import load_parser from nltk.parse.malt import MaltParser from nltk.sem.drt import resolve_anaphora, AnaphoraResolutionException from nltk.sem.glue import DrtGlue from nltk.sem.logic import Expression from nltk.inference.mace import MaceCommand from nltk.inference.prover9 import Prover9Command class ReadingCommand(object): def parse_to_readings(self, sentence): """ :param sentence: the sentence to read :type sentence: str """ raise NotImplementedError() def process_thread(self, sentence_readings): """ This method should be used to handle dependencies between readings such as resolving anaphora. :param sentence_readings: readings to process :type sentence_readings: list(Expression) :return: the list of readings after processing :rtype: list(Expression) """ return sentence_readings def combine_readings(self, readings): """ :param readings: readings to combine :type readings: list(Expression) :return: one combined reading :rtype: Expression """ raise NotImplementedError() def to_fol(self, expression): """ Convert this expression into a First-Order Logic expression. :param expression: an expression :type expression: Expression :return: a FOL version of the input expression :rtype: Expression """ raise NotImplementedError() class CfgReadingCommand(ReadingCommand): def __init__(self, gramfile=None): """ :param gramfile: name of file where grammar can be loaded :type gramfile: str """ self._gramfile = (gramfile if gramfile else 'grammars/book_grammars/discourse.fcfg') self._parser = load_parser(self._gramfile) def parse_to_readings(self, sentence): """:see: ReadingCommand.parse_to_readings()""" from nltk.sem import root_semrep tokens = sentence.split() trees = self._parser.parse(tokens) return [root_semrep(tree) for tree in trees] def combine_readings(self, readings): """:see: ReadingCommand.combine_readings()""" return reduce(and_, readings) def to_fol(self, expression): """:see: ReadingCommand.to_fol()""" return expression class DrtGlueReadingCommand(ReadingCommand): def __init__(self, semtype_file=None, remove_duplicates=False, depparser=None): """ :param semtype_file: name of file where grammar can be loaded :param remove_duplicates: should duplicates be removed? :param depparser: the dependency parser """ if semtype_file is None: semtype_file = os.path.join('grammars', 'sample_grammars','drt_glue.semtype') self._glue = DrtGlue(semtype_file=semtype_file, remove_duplicates=remove_duplicates, depparser=depparser) def parse_to_readings(self, sentence): """:see: ReadingCommand.parse_to_readings()""" return self._glue.parse_to_meaning(sentence) def process_thread(self, sentence_readings): """:see: ReadingCommand.process_thread()""" try: return [self.combine_readings(sentence_readings)] except AnaphoraResolutionException: return [] def combine_readings(self, readings): """:see: ReadingCommand.combine_readings()""" thread_reading = reduce(add, readings) return resolve_anaphora(thread_reading.simplify()) def to_fol(self, expression): """:see: ReadingCommand.to_fol()""" return expression.fol() class DiscourseTester(object): """ Check properties of an ongoing discourse. """ def __init__(self, input, reading_command=None, background=None): """ Initialize a ``DiscourseTester``. 
:param input: the discourse sentences :type input: list of str :param background: Formulas which express background assumptions :type background: list(Expression) """ self._input = input self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(input)]) self._models = None self._readings = {} self._reading_command = (reading_command if reading_command else CfgReadingCommand()) self._threads = {} self._filtered_threads = {} if background is not None: from nltk.sem.logic import Expression for e in background: assert isinstance(e, Expression) self._background = background else: self._background = [] ############################### # Sentences ############################### def sentences(self): """ Display the list of sentences in the current discourse. """ for id in sorted(self._sentences): print("%s: %s" % (id, self._sentences[id])) def add_sentence(self, sentence, informchk=False, consistchk=False,): """ Add a sentence to the current discourse. Updates ``self._input`` and ``self._sentences``. :param sentence: An input sentence :type sentence: str :param informchk: if ``True``, check that the result of adding the sentence is thread-informative. Updates ``self._readings``. :param consistchk: if ``True``, check that the result of adding the sentence is thread-consistent. Updates ``self._readings``. """ # check whether the new sentence is informative (i.e. not entailed by the previous discourse) if informchk: self.readings(verbose=False) for tid in sorted(self._threads): assumptions = [reading for (rid, reading) in self.expand_threads(tid)] assumptions += self._background for sent_reading in self._get_readings(sentence): tp = Prover9Command(goal=sent_reading, assumptions=assumptions) if tp.prove(): print("Sentence '%s' under reading '%s':" % (sentence, str(sent_reading))) print("Not informative relative to thread '%s'" % tid) self._input.append(sentence) self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(self._input)]) # check whether adding the new sentence to the discourse preserves consistency (i.e. a model can be found for the combined set of # of assumptions if consistchk: self.readings(verbose=False) self.models(show=False) def retract_sentence(self, sentence, verbose=True): """ Remove a sentence from the current discourse. Updates ``self._input``, ``self._sentences`` and ``self._readings``. :param sentence: An input sentence :type sentence: str :param verbose: If ``True``, report on the updated list of sentences. """ try: self._input.remove(sentence) except ValueError: print("Retraction failed. The sentence '%s' is not part of the current discourse:" % sentence) self.sentences() return None self._sentences = dict([('s%s' % i, sent) for i, sent in enumerate(self._input)]) self.readings(verbose=False) if verbose: print("Current sentences are ") self.sentences() def grammar(self): """ Print out the grammar in use for parsing input sentences """ show_cfg(self._reading_command._gramfile) ############################### # Readings and Threads ############################### def _get_readings(self, sentence): """ Build a list of semantic readings for a sentence. :rtype: list(Expression) """ return self._reading_command.parse_to_readings(sentence) def _construct_readings(self): """ Use ``self._sentences`` to construct a value for ``self._readings``. 
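The result maps each sentence ID to a dictionary of its readings, keyed by
reading ID, e.g. (schematic)::

    {'s0': {'s0-r0': <Expression ...>, 's0-r1': <Expression ...>},
     's1': {'s1-r0': <Expression ...>}}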
""" # re-initialize self._readings in case we have retracted a sentence self._readings = {} for sid in sorted(self._sentences): sentence = self._sentences[sid] readings = self._get_readings(sentence) self._readings[sid] = dict([("%s-r%s" % (sid, rid), reading.simplify()) for rid, reading in enumerate(sorted(readings, key=str))]) def _construct_threads(self): """ Use ``self._readings`` to construct a value for ``self._threads`` and use the model builder to construct a value for ``self._filtered_threads`` """ thread_list = [[]] for sid in sorted(self._readings): thread_list = self.multiply(thread_list, sorted(self._readings[sid])) self._threads = dict([("d%s" % tid, thread) for tid, thread in enumerate(thread_list)]) # re-initialize the filtered threads self._filtered_threads = {} # keep the same ids, but only include threads which get models consistency_checked = self._check_consistency(self._threads) for (tid, thread) in self._threads.items(): if (tid, True) in consistency_checked: self._filtered_threads[tid] = thread def _show_readings(self, sentence=None): """ Print out the readings for the discourse (or a single sentence). """ if sentence is not None: print("The sentence '%s' has these readings:" % sentence) for r in [str(reading) for reading in (self._get_readings(sentence))]: print(" %s" % r) else: for sid in sorted(self._readings): print() print('%s readings:' % sid) print() #'-' * 30 for rid in sorted(self._readings[sid]): lf = self._readings[sid][rid] print("%s: %s" % (rid, lf.normalize())) def _show_threads(self, filter=False, show_thread_readings=False): """ Print out the value of ``self._threads`` or ``self._filtered_hreads`` """ threads = (self._filtered_threads if filter else self._threads) for tid in sorted(threads): if show_thread_readings: readings = [self._readings[rid.split('-')[0]][rid] for rid in self._threads[tid]] try: thread_reading = ": %s" % \ self._reading_command.combine_readings(readings).normalize() except Exception as e: thread_reading = ': INVALID: %s' % e.__class__.__name__ else: thread_reading = '' print("%s:" % tid, self._threads[tid], thread_reading) def readings(self, sentence=None, threaded=False, verbose=True, filter=False, show_thread_readings=False): """ Construct and show the readings of the discourse (or of a single sentence). :param sentence: test just this sentence :type sentence: str :param threaded: if ``True``, print out each thread ID and the corresponding thread. :param filter: if ``True``, only print out consistent thread IDs and threads. """ self._construct_readings() self._construct_threads() # if we are filtering or showing thread readings, show threads if filter or show_thread_readings: threaded = True if verbose: if not threaded: self._show_readings(sentence=sentence) else: self._show_threads(filter=filter, show_thread_readings=show_thread_readings) def expand_threads(self, thread_id, threads=None): """ Given a thread ID, find the list of ``logic.Expression`` objects corresponding to the reading IDs in that thread. 
:param thread_id: thread ID :type thread_id: str :param threads: a mapping from thread IDs to lists of reading IDs :type threads: dict :return: A list of pairs ``(rid, reading)`` where reading is the ``logic.Expression`` associated with a reading ID :rtype: list of tuple """ if threads is None: threads = self._threads return [(rid, self._readings[sid][rid]) for rid in threads[thread_id] for sid in rid.split('-')[:1]] ############################### # Models and Background ############################### def _check_consistency(self, threads, show=False, verbose=False): results = [] for tid in sorted(threads): assumptions = [reading for (rid, reading) in self.expand_threads(tid, threads=threads)] assumptions = list(map(self._reading_command.to_fol, self._reading_command.process_thread(assumptions))) if assumptions: assumptions += self._background # if Mace4 finds a model, it always seems to find it quickly mb = MaceCommand(None, assumptions, max_models=20) modelfound = mb.build_model() else: modelfound = False results.append((tid, modelfound)) if show: spacer(80) print("Model for Discourse Thread %s" % tid) spacer(80) if verbose: for a in assumptions: print(a) spacer(80) if modelfound: print(mb.model(format='cooked')) else: print("No model found!\n") return results def models(self, thread_id=None, show=True, verbose=False): """ Call Mace4 to build a model for each current discourse thread. :param thread_id: thread ID :type thread_id: str :param show: If ``True``, display the model that has been found. """ self._construct_readings() self._construct_threads() threads = ({thread_id: self._threads[thread_id]} if thread_id else self._threads) for (tid, modelfound) in self._check_consistency(threads, show=show, verbose=verbose): idlist = [rid for rid in threads[tid]] if not modelfound: print("Inconsistent discourse: %s %s:" % (tid, idlist)) for rid, reading in self.expand_threads(tid): print(" %s: %s" % (rid, reading.normalize())) print() else: print("Consistent discourse: %s %s:" % (tid, idlist)) for rid, reading in self.expand_threads(tid): print(" %s: %s" % (rid, reading.normalize())) print() def add_background(self, background, verbose=False): """ Add a list of background assumptions for reasoning about the discourse. When called, this method also updates the discourse model's set of readings and threads. :param background: Formulas which contain background information :type background: list(Expression) """ from nltk.sem.logic import Expression for (count, e) in enumerate(background): assert isinstance(e, Expression) if verbose: print("Adding assumption %s to background" % count) self._background.append(e) #update the state self._construct_readings() self._construct_threads() def background(self): """ Show the current background assumptions. """ for e in self._background: print(str(e)) ############################### # Misc ############################### @staticmethod def multiply(discourse, readings): """ Multiply every thread in ``discourse`` by every reading in ``readings``. 
Given discourse = [['A'], ['B']], readings = ['a', 'b', 'c'] , returns [['A', 'a'], ['A', 'b'], ['A', 'c'], ['B', 'a'], ['B', 'b'], ['B', 'c']] :param discourse: the current list of readings :type discourse: list of lists :param readings: an additional list of readings :type readings: list(Expression) :rtype: A list of lists """ result = [] for sublist in discourse: for r in readings: new = [] new += sublist new.append(r) result.append(new) return result #multiply = DiscourseTester.multiply #L1 = [['A'], ['B']] #L2 = ['a', 'b', 'c'] #print multiply(L1,L2) def load_fol(s): """ Temporarily duplicated from ``nltk.sem.util``. Convert a file of first order formulas into a list of ``Expression`` objects. :param s: the contents of the file :type s: str :return: a list of parsed formulas. :rtype: list(Expression) """ statements = [] for linenum, line in enumerate(s.splitlines()): line = line.strip() if line.startswith('#') or line=='': continue try: statements.append(Expression.fromstring(line)) except Exception: raise ValueError('Unable to parse line %s: %s' % (linenum, line)) return statements ############################### # Demo ############################### def discourse_demo(reading_command=None): """ Illustrate the various methods of ``DiscourseTester`` """ dt = DiscourseTester(['A boxer walks', 'Every boxer chases a girl'], reading_command) dt.models() print() #dt.grammar() print() dt.sentences() print() dt.readings() print() dt.readings(threaded=True) print() dt.models('d1') dt.add_sentence('John is a boxer') print() dt.sentences() print() dt.readings(threaded=True) print() dt = DiscourseTester(['A student dances', 'Every student is a person'], reading_command) print() dt.add_sentence('No person dances', consistchk=True) print() dt.readings() print() dt.retract_sentence('No person dances', verbose=True) print() dt.models() print() dt.readings('A person dances') print() dt.add_sentence('A person dances', informchk=True) dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer', 'Vincent is married', 'Fido barks'], reading_command) dt.readings(filter=True) import nltk.data background_file = os.path.join('grammars', 'book_grammars', 'background.fol') background = nltk.data.load(background_file) print() dt.add_background(background, verbose=False) dt.background() print() dt.readings(filter=True) print() dt.models() def drt_discourse_demo(reading_command=None): """ Illustrate the various methods of ``DiscourseTester`` """ dt = DiscourseTester(['every dog chases a boy', 'he runs'], reading_command) dt.models() print() dt.sentences() print() dt.readings() print() dt.readings(show_thread_readings=True) print() dt.readings(filter=True, show_thread_readings=True) def spacer(num=30): print('-' * num) def demo(): discourse_demo() tagger = RegexpTagger( [('^(chases|runs)$', 'VB'), ('^(a)$', 'ex_quant'), ('^(every)$', 'univ_quant'), ('^(dog|boy)$', 'NN'), ('^(he)$', 'PRP') ]) depparser = MaltParser(tagger=tagger) drt_discourse_demo(DrtGlueReadingCommand(remove_duplicates=False, depparser=depparser)) if __name__ == '__main__': demo() nltk-3.1/nltk/inference/mace.py0000644000076500000240000002613612574600335016256 0ustar sbstaff00000000000000# Natural Language Toolkit: Interface to the Mace4 Model Builder # # Author: Dan Garrette # Ewan Klein # URL: # For license information, see LICENSE.TXT """ A model builder that makes use of the external 'Mace4' package. 
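A minimal sketch of the model-builder interface (the external Mace4 programs
must be installed, so the call is shown skipped):

    >>> from nltk.sem.logic import Expression
    >>> from nltk.inference.mace import Mace
    >>> Mace().build_model(None, [Expression.fromstring('man(socrates)')])  # doctest: +SKIP
    True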
""" from __future__ import print_function import os import tempfile from nltk.sem.logic import is_indvar from nltk.sem import Valuation, Expression from nltk.inference.api import ModelBuilder, BaseModelBuilderCommand from nltk.inference.prover9 import Prover9CommandParent, Prover9Parent class MaceCommand(Prover9CommandParent, BaseModelBuilderCommand): """ A ``MaceCommand`` specific to the ``Mace`` model builder. It contains a print_assumptions() method that is used to print the list of assumptions in multiple formats. """ _interpformat_bin = None def __init__(self, goal=None, assumptions=None, max_models=500, model_builder=None): """ :param goal: Input expression to prove :type goal: sem.Expression :param assumptions: Input expressions to use as assumptions in the proof. :type assumptions: list(sem.Expression) :param max_models: The maximum number of models that Mace will try before simply returning false. (Use 0 for no maximum.) :type max_models: int """ if model_builder is not None: assert isinstance(model_builder, Mace) else: model_builder = Mace(max_models) BaseModelBuilderCommand.__init__(self, model_builder, goal, assumptions) @property def valuation(mbc): return mbc.model('valuation') def _convert2val(self, valuation_str): """ Transform the output file into an NLTK-style Valuation. :return: A model if one is generated; None otherwise. :rtype: sem.Valuation """ valuation_standard_format = self._transform_output(valuation_str, 'standard') val = [] for line in valuation_standard_format.splitlines(False): l = line.strip() if l.startswith('interpretation'): # find the number of entities in the model num_entities = int(l[l.index('(')+1:l.index(',')].strip()) elif l.startswith('function') and l.find('_') == -1: # replace the integer identifier with a corresponding alphabetic character name = l[l.index('(')+1:l.index(',')].strip() if is_indvar(name): name = name.upper() value = int(l[l.index('[')+1:l.index(']')].strip()) val.append((name, MaceCommand._make_model_var(value))) elif l.startswith('relation'): l = l[l.index('(')+1:] if '(' in l: #relation is not nullary name = l[:l.index('(')].strip() values = [int(v.strip()) for v in l[l.index('[')+1:l.index(']')].split(',')] val.append((name, MaceCommand._make_relation_set(num_entities, values))) else: #relation is nullary name = l[:l.index(',')].strip() value = int(l[l.index('[')+1:l.index(']')].strip()) val.append((name, value == 1)) return Valuation(val) @staticmethod def _make_relation_set(num_entities, values): """ Convert a Mace4-style relation table into a dictionary. :param num_entities: the number of entities in the model; determines the row length in the table. :type num_entities: int :param values: a list of 1's and 0's that represent whether a relation holds in a Mace4 model. :type values: list of int """ r = set() for position in [pos for (pos,v) in enumerate(values) if v == 1]: r.add(tuple(MaceCommand._make_relation_tuple(position, values, num_entities))) return r @staticmethod def _make_relation_tuple(position, values, num_entities): if len(values) == 1: return [] else: sublist_size = len(values) // num_entities sublist_start = position // sublist_size sublist_position = int(position % sublist_size) sublist = values[sublist_start*sublist_size:(sublist_start+1)*sublist_size] return [MaceCommand._make_model_var(sublist_start)] + \ MaceCommand._make_relation_tuple(sublist_position, sublist, num_entities) @staticmethod def _make_model_var(value): """ Pick an alphabetic character as identifier for an entity in the model. 
:param value: where to index into the list of characters :type value: int """ letter = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n', 'o','p','q','r','s','t','u','v','w','x','y','z'][value] num = value // 26 return (letter + str(num) if num > 0 else letter) def _decorate_model(self, valuation_str, format): """ Print out a Mace4 model using any Mace4 ``interpformat`` format. See http://www.cs.unm.edu/~mccune/mace4/manual/ for details. :param valuation_str: str with the model builder's output :param format: str indicating the format for displaying models. Defaults to 'standard' format. :return: str """ if not format: return valuation_str elif format == 'valuation': return self._convert2val(valuation_str) else: return self._transform_output(valuation_str, format) def _transform_output(self, valuation_str, format): """ Transform the output file into any Mace4 ``interpformat`` format. :param format: Output format for displaying models. :type format: str """ if format in ['standard', 'standard2', 'portable', 'tabular', 'raw', 'cooked', 'xml', 'tex']: return self._call_interpformat(valuation_str, [format])[0] else: raise LookupError("The specified format does not exist") def _call_interpformat(self, input_str, args=[], verbose=False): """ Call the ``interpformat`` binary with the given input. :param input_str: A string whose contents are used as stdin. :param args: A list of command-line arguments. :return: A tuple (stdout, returncode) :see: ``config_prover9`` """ if self._interpformat_bin is None: self._interpformat_bin = self._modelbuilder._find_binary( 'interpformat', verbose) return self._modelbuilder._call(input_str, self._interpformat_bin, args, verbose) class Mace(Prover9Parent, ModelBuilder): _mace4_bin = None def __init__(self, end_size=500): self._end_size = end_size """The maximum model size that Mace will try before simply returning false. (Use -1 for no maximum.)""" def _build_model(self, goal=None, assumptions=None, verbose=False): """ Use Mace4 to build a first order model. :return: ``True`` if a model was found (i.e. Mace returns value of 0), else ``False`` """ if not assumptions: assumptions = [] stdout, returncode = self._call_mace4(self.prover9_input(goal, assumptions), verbose=verbose) return (returncode == 0, stdout) def _call_mace4(self, input_str, args=[], verbose=False): """ Call the ``mace4`` binary with the given input. :param input_str: A string whose contents are used as stdin. :param args: A list of command-line arguments. :return: A tuple (stdout, returncode) :see: ``config_prover9`` """ if self._mace4_bin is None: self._mace4_bin = self._find_binary('mace4', verbose) updated_input_str = '' if self._end_size > 0: updated_input_str += 'assign(end_size, %d).\n\n' % self._end_size updated_input_str += input_str return self._call(updated_input_str, self._mace4_bin, args, verbose) def spacer(num=30): print('-' * num) def decode_result(found): """ Decode the result of model_found() :param found: The output of model_found() :type found: bool """ return {True: 'Countermodel found', False: 'No countermodel found', None: 'None'}[found] def test_model_found(arguments): """ Try some proofs and exhibit the results. """ for (goal, assumptions) in arguments: g = Expression.fromstring(goal) alist = [lp.parse(a) for a in assumptions] m = MaceCommand(g, assumptions=alist, max_models=50) found = m.build_model() for a in alist: print(' %s' % a) print('|- %s: %s\n' % (g, decode_result(found))) def test_build_model(arguments): """ Try to build a ``nltk.sem.Valuation``. 
""" g = Expression.fromstring('all x.man(x)') alist = [Expression.fromstring(a) for a in ['man(John)', 'man(Socrates)', 'man(Bill)', 'some x.(-(x = John) & man(x) & sees(John,x))', 'some x.(-(x = Bill) & man(x))', 'all x.some y.(man(x) -> gives(Socrates,x,y))']] m = MaceCommand(g, assumptions=alist) m.build_model() spacer() print("Assumptions and Goal") spacer() for a in alist: print(' %s' % a) print('|- %s: %s\n' % (g, decode_result(m.build_model()))) spacer() #print m.model('standard') #print m.model('cooked') print("Valuation") spacer() print(m.valuation, '\n') def test_transform_output(argument_pair): """ Transform the model into various Mace4 ``interpformat`` formats. """ g = Expression.fromstring(argument_pair[0]) alist = [lp.parse(a) for a in argument_pair[1]] m = MaceCommand(g, assumptions=alist) m.build_model() for a in alist: print(' %s' % a) print('|- %s: %s\n' % (g, m.build_model())) for format in ['standard', 'portable', 'xml', 'cooked']: spacer() print("Using '%s' format" % format) spacer() print(m.model(format=format)) def test_make_relation_set(): print(MaceCommand._make_relation_set(num_entities=3, values=[1,0,1]) == set([('c',), ('a',)])) print(MaceCommand._make_relation_set(num_entities=3, values=[0,0,0,0,0,0,1,0,0]) == set([('c', 'a')])) print(MaceCommand._make_relation_set(num_entities=2, values=[0,0,1,0,0,0,1,0]) == set([('a', 'b', 'a'), ('b', 'b', 'a')])) arguments = [ ('mortal(Socrates)', ['all x.(man(x) -> mortal(x))', 'man(Socrates)']), ('(not mortal(Socrates))', ['all x.(man(x) -> mortal(x))', 'man(Socrates)']) ] def demo(): test_model_found(arguments) test_build_model(arguments) test_transform_output(arguments[1]) if __name__ == '__main__': demo() nltk-3.1/nltk/inference/nonmonotonic.py0000644000076500000240000004425712607224144020072 0ustar sbstaff00000000000000# Natural Language Toolkit: Nonmonotonic Reasoning # # Author: Daniel H. Garrette # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT """ A module to perform nonmonotonic reasoning. The ideas and demonstrations in this module are based on "Logical Foundations of Artificial Intelligence" by Michael R. Genesereth and Nils J. Nilsson. """ from __future__ import print_function, unicode_literals from nltk.inference.prover9 import Prover9, Prover9Command from collections import defaultdict from functools import reduce from nltk.sem.logic import (VariableExpression, EqualityExpression, ApplicationExpression, Expression, AbstractVariableExpression, AllExpression, BooleanExpression, NegatedExpression, ExistsExpression, Variable, ImpExpression, AndExpression, unique_variable, operator) from nltk.inference.api import Prover, ProverCommandDecorator from nltk.compat import python_2_unicode_compatible class ProverParseError(Exception): pass def get_domain(goal, assumptions): if goal is None: all_expressions = assumptions else: all_expressions = assumptions + [-goal] return reduce(operator.or_, (a.constants() for a in all_expressions), set()) class ClosedDomainProver(ProverCommandDecorator): """ This is a prover decorator that adds domain closure assumptions before proving. 
""" def assumptions(self): assumptions = [a for a in self._command.assumptions()] goal = self._command.goal() domain = get_domain(goal, assumptions) return [self.replace_quants(ex, domain) for ex in assumptions] def goal(self): goal = self._command.goal() domain = get_domain(goal, self._command.assumptions()) return self.replace_quants(goal, domain) def replace_quants(self, ex, domain): """ Apply the closed domain assumption to the expression - Domain = union([e.free()|e.constants() for e in all_expressions]) - translate "exists x.P" to "(z=d1 | z=d2 | ... ) & P.replace(x,z)" OR "P.replace(x, d1) | P.replace(x, d2) | ..." - translate "all x.P" to "P.replace(x, d1) & P.replace(x, d2) & ..." :param ex: ``Expression`` :param domain: set of {Variable}s :return: ``Expression`` """ if isinstance(ex, AllExpression): conjuncts = [ex.term.replace(ex.variable, VariableExpression(d)) for d in domain] conjuncts = [self.replace_quants(c, domain) for c in conjuncts] return reduce(lambda x,y: x&y, conjuncts) elif isinstance(ex, BooleanExpression): return ex.__class__(self.replace_quants(ex.first, domain), self.replace_quants(ex.second, domain) ) elif isinstance(ex, NegatedExpression): return -self.replace_quants(ex.term, domain) elif isinstance(ex, ExistsExpression): disjuncts = [ex.term.replace(ex.variable, VariableExpression(d)) for d in domain] disjuncts = [self.replace_quants(d, domain) for d in disjuncts] return reduce(lambda x,y: x|y, disjuncts) else: return ex class UniqueNamesProver(ProverCommandDecorator): """ This is a prover decorator that adds unique names assumptions before proving. """ def assumptions(self): """ - Domain = union([e.free()|e.constants() for e in all_expressions]) - if "d1 = d2" cannot be proven from the premises, then add "d1 != d2" """ assumptions = self._command.assumptions() domain = list(get_domain(self._command.goal(), assumptions)) #build a dictionary of obvious equalities eq_sets = SetHolder() for a in assumptions: if isinstance(a, EqualityExpression): av = a.first.variable bv = a.second.variable #put 'a' and 'b' in the same set eq_sets[av].add(bv) new_assumptions = [] for i,a in enumerate(domain): for b in domain[i+1:]: #if a and b are not already in the same equality set if b not in eq_sets[a]: newEqEx = EqualityExpression(VariableExpression(a), VariableExpression(b)) if Prover9().prove(newEqEx, assumptions): #we can prove that the names are the same entity. #remember that they are equal so we don't re-check. eq_sets[a].add(b) else: #we can't prove it, so assume unique names new_assumptions.append(-newEqEx) return assumptions + new_assumptions class SetHolder(list): """ A list of sets of Variables. """ def __getitem__(self, item): """ :param item: ``Variable`` :return: the set containing 'item' """ assert isinstance(item, Variable) for s in self: if item in s: return s #item is not found in any existing set. so create a new set new = set([item]) self.append(new) return new class ClosedWorldProver(ProverCommandDecorator): """ This is a prover decorator that completes predicates before proving. If the assumptions contain "P(A)", then "all x.(P(x) -> (x=A))" is the completion of "P". If the assumptions contain "all x.(ostrich(x) -> bird(x))", then "all x.(bird(x) -> ostrich(x))" is the completion of "bird". If the assumptions don't contain anything that are "P", then "all x.-P(x)" is the completion of "P". 
walk(Socrates) Socrates != Bill + all x.(walk(x) -> (x=Socrates)) ---------------- -walk(Bill) see(Socrates, John) see(John, Mary) Socrates != John John != Mary + all x.all y.(see(x,y) -> ((x=Socrates & y=John) | (x=John & y=Mary))) ---------------- -see(Socrates, Mary) all x.(ostrich(x) -> bird(x)) bird(Tweety) -ostrich(Sam) Sam != Tweety + all x.(bird(x) -> (ostrich(x) | x=Tweety)) + all x.-ostrich(x) ------------------- -bird(Sam) """ def assumptions(self): assumptions = self._command.assumptions() predicates = self._make_predicate_dict(assumptions) new_assumptions = [] for p in predicates: predHolder = predicates[p] new_sig = self._make_unique_signature(predHolder) new_sig_exs = [VariableExpression(v) for v in new_sig] disjuncts = [] #Turn the signatures into disjuncts for sig in predHolder.signatures: equality_exs = [] for v1,v2 in zip(new_sig_exs, sig): equality_exs.append(EqualityExpression(v1,v2)) disjuncts.append(reduce(lambda x,y: x&y, equality_exs)) #Turn the properties into disjuncts for prop in predHolder.properties: #replace variables from the signature with new sig variables bindings = {} for v1,v2 in zip(new_sig_exs, prop[0]): bindings[v2] = v1 disjuncts.append(prop[1].substitute_bindings(bindings)) #make the assumption if disjuncts: #disjuncts exist, so make an implication antecedent = self._make_antecedent(p, new_sig) consequent = reduce(lambda x,y: x|y, disjuncts) accum = ImpExpression(antecedent, consequent) else: #nothing has property 'p' accum = NegatedExpression(self._make_antecedent(p, new_sig)) #quantify the implication for new_sig_var in new_sig[::-1]: accum = AllExpression(new_sig_var, accum) new_assumptions.append(accum) return assumptions + new_assumptions def _make_unique_signature(self, predHolder): """ This method figures out how many arguments the predicate takes and returns a tuple containing that number of unique variables. """ return tuple(unique_variable() for i in range(predHolder.signature_len)) def _make_antecedent(self, predicate, signature): """ Return an application expression with 'predicate' as the predicate and 'signature' as the list of arguments. """ antecedent = predicate for v in signature: antecedent = antecedent(VariableExpression(v)) return antecedent def _make_predicate_dict(self, assumptions): """ Create a dictionary of predicates from the assumptions. 
:param assumptions: a list of ``Expression``s :return: dict mapping ``AbstractVariableExpression`` to ``PredHolder`` """ predicates = defaultdict(PredHolder) for a in assumptions: self._map_predicates(a, predicates) return predicates def _map_predicates(self, expression, predDict): if isinstance(expression, ApplicationExpression): func, args = expression.uncurry() if isinstance(func, AbstractVariableExpression): predDict[func].append_sig(tuple(args)) elif isinstance(expression, AndExpression): self._map_predicates(expression.first, predDict) self._map_predicates(expression.second, predDict) elif isinstance(expression, AllExpression): #collect all the universally quantified variables sig = [expression.variable] term = expression.term while isinstance(term, AllExpression): sig.append(term.variable) term = term.term if isinstance(term, ImpExpression): if isinstance(term.first, ApplicationExpression) and \ isinstance(term.second, ApplicationExpression): func1, args1 = term.first.uncurry() func2, args2 = term.second.uncurry() if isinstance(func1, AbstractVariableExpression) and \ isinstance(func2, AbstractVariableExpression) and \ sig == [v.variable for v in args1] and \ sig == [v.variable for v in args2]: predDict[func2].append_prop((tuple(sig), term.first)) predDict[func1].validate_sig_len(sig) @python_2_unicode_compatible class PredHolder(object): """ This class will be used by a dictionary that will store information about predicates to be used by the ``ClosedWorldProver``. The 'signatures' property is a list of tuples defining signatures for which the predicate is true. For instance, 'see(john, mary)' would be result in the signature '(john,mary)' for 'see'. The second element of the pair is a list of pairs such that the first element of the pair is a tuple of variables and the second element is an expression of those variables that makes the predicate true. For instance, 'all x.all y.(see(x,y) -> know(x,y))' would result in "((x,y),('see(x,y)'))" for 'know'. 
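For illustration, an informal sketch of how ``ClosedWorldProver`` fills in
these fields (the names below stand for the corresponding ``Expression``
objects, not strings)::

    see_holder = PredHolder()
    see_holder.append_sig((john, mary))         # from 'see(john,mary)'

    know_holder = PredHolder()
    know_holder.append_prop(((x, y), see_xy))   # from 'all x.all y.(see(x,y) -> know(x,y))'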
""" def __init__(self): self.signatures = [] self.properties = [] self.signature_len = None def append_sig(self, new_sig): self.validate_sig_len(new_sig) self.signatures.append(new_sig) def append_prop(self, new_prop): self.validate_sig_len(new_prop[0]) self.properties.append(new_prop) def validate_sig_len(self, new_sig): if self.signature_len is None: self.signature_len = len(new_sig) elif self.signature_len != len(new_sig): raise Exception("Signature lengths do not match") def __str__(self): return '(%s,%s,%s)' % (self.signatures, self.properties, self.signature_len) def __repr__(self): return "%s" % self def closed_domain_demo(): lexpr = Expression.fromstring p1 = lexpr(r'exists x.walk(x)') p2 = lexpr(r'man(Socrates)') c = lexpr(r'walk(Socrates)') prover = Prover9Command(c, [p1,p2]) print(prover.prove()) cdp = ClosedDomainProver(prover) print('assumptions:') for a in cdp.assumptions(): print(' ', a) print('goal:', cdp.goal()) print(cdp.prove()) p1 = lexpr(r'exists x.walk(x)') p2 = lexpr(r'man(Socrates)') p3 = lexpr(r'-walk(Bill)') c = lexpr(r'walk(Socrates)') prover = Prover9Command(c, [p1,p2,p3]) print(prover.prove()) cdp = ClosedDomainProver(prover) print('assumptions:') for a in cdp.assumptions(): print(' ', a) print('goal:', cdp.goal()) print(cdp.prove()) p1 = lexpr(r'exists x.walk(x)') p2 = lexpr(r'man(Socrates)') p3 = lexpr(r'-walk(Bill)') c = lexpr(r'walk(Socrates)') prover = Prover9Command(c, [p1,p2,p3]) print(prover.prove()) cdp = ClosedDomainProver(prover) print('assumptions:') for a in cdp.assumptions(): print(' ', a) print('goal:', cdp.goal()) print(cdp.prove()) p1 = lexpr(r'walk(Socrates)') p2 = lexpr(r'walk(Bill)') c = lexpr(r'all x.walk(x)') prover = Prover9Command(c, [p1,p2]) print(prover.prove()) cdp = ClosedDomainProver(prover) print('assumptions:') for a in cdp.assumptions(): print(' ', a) print('goal:', cdp.goal()) print(cdp.prove()) p1 = lexpr(r'girl(mary)') p2 = lexpr(r'dog(rover)') p3 = lexpr(r'all x.(girl(x) -> -dog(x))') p4 = lexpr(r'all x.(dog(x) -> -girl(x))') p5 = lexpr(r'chase(mary, rover)') c = lexpr(r'exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))') prover = Prover9Command(c, [p1,p2,p3,p4,p5]) print(prover.prove()) cdp = ClosedDomainProver(prover) print('assumptions:') for a in cdp.assumptions(): print(' ', a) print('goal:', cdp.goal()) print(cdp.prove()) def unique_names_demo(): lexpr = Expression.fromstring p1 = lexpr(r'man(Socrates)') p2 = lexpr(r'man(Bill)') c = lexpr(r'exists x.exists y.(x != y)') prover = Prover9Command(c, [p1,p2]) print(prover.prove()) unp = UniqueNamesProver(prover) print('assumptions:') for a in unp.assumptions(): print(' ', a) print('goal:', unp.goal()) print(unp.prove()) p1 = lexpr(r'all x.(walk(x) -> (x = Socrates))') p2 = lexpr(r'Bill = William') p3 = lexpr(r'Bill = Billy') c = lexpr(r'-walk(William)') prover = Prover9Command(c, [p1,p2,p3]) print(prover.prove()) unp = UniqueNamesProver(prover) print('assumptions:') for a in unp.assumptions(): print(' ', a) print('goal:', unp.goal()) print(unp.prove()) def closed_world_demo(): lexpr = Expression.fromstring p1 = lexpr(r'walk(Socrates)') p2 = lexpr(r'(Socrates != Bill)') c = lexpr(r'-walk(Bill)') prover = Prover9Command(c, [p1,p2]) print(prover.prove()) cwp = ClosedWorldProver(prover) print('assumptions:') for a in cwp.assumptions(): print(' ', a) print('goal:', cwp.goal()) print(cwp.prove()) p1 = lexpr(r'see(Socrates, John)') p2 = lexpr(r'see(John, Mary)') p3 = lexpr(r'(Socrates != John)') p4 = lexpr(r'(John != Mary)') c = lexpr(r'-see(Socrates, Mary)') prover = 
Prover9Command(c, [p1,p2,p3,p4]) print(prover.prove()) cwp = ClosedWorldProver(prover) print('assumptions:') for a in cwp.assumptions(): print(' ', a) print('goal:', cwp.goal()) print(cwp.prove()) p1 = lexpr(r'all x.(ostrich(x) -> bird(x))') p2 = lexpr(r'bird(Tweety)') p3 = lexpr(r'-ostrich(Sam)') p4 = lexpr(r'Sam != Tweety') c = lexpr(r'-bird(Sam)') prover = Prover9Command(c, [p1,p2,p3,p4]) print(prover.prove()) cwp = ClosedWorldProver(prover) print('assumptions:') for a in cwp.assumptions(): print(' ', a) print('goal:', cwp.goal()) print(cwp.prove()) def combination_prover_demo(): lexpr = Expression.fromstring p1 = lexpr(r'see(Socrates, John)') p2 = lexpr(r'see(John, Mary)') c = lexpr(r'-see(Socrates, Mary)') prover = Prover9Command(c, [p1,p2]) print(prover.prove()) command = ClosedDomainProver( UniqueNamesProver( ClosedWorldProver(prover))) for a in command.assumptions(): print(a) print(command.prove()) def default_reasoning_demo(): lexpr = Expression.fromstring premises = [] #define taxonomy premises.append(lexpr(r'all x.(elephant(x) -> animal(x))')) premises.append(lexpr(r'all x.(bird(x) -> animal(x))')) premises.append(lexpr(r'all x.(dove(x) -> bird(x))')) premises.append(lexpr(r'all x.(ostrich(x) -> bird(x))')) premises.append(lexpr(r'all x.(flying_ostrich(x) -> ostrich(x))')) #default properties premises.append(lexpr(r'all x.((animal(x) & -Ab1(x)) -> -fly(x))')) #normal animals don't fly premises.append(lexpr(r'all x.((bird(x) & -Ab2(x)) -> fly(x))')) #normal birds fly premises.append(lexpr(r'all x.((ostrich(x) & -Ab3(x)) -> -fly(x))')) #normal ostriches don't fly #specify abnormal entities premises.append(lexpr(r'all x.(bird(x) -> Ab1(x))')) #flight premises.append(lexpr(r'all x.(ostrich(x) -> Ab2(x))')) #non-flying bird premises.append(lexpr(r'all x.(flying_ostrich(x) -> Ab3(x))')) #flying ostrich #define entities premises.append(lexpr(r'elephant(E)')) premises.append(lexpr(r'dove(D)')) premises.append(lexpr(r'ostrich(O)')) #print the assumptions prover = Prover9Command(None, premises) command = UniqueNamesProver(ClosedWorldProver(prover)) for a in command.assumptions(): print(a) print_proof('-fly(E)', premises) print_proof('fly(D)', premises) print_proof('-fly(O)', premises) def print_proof(goal, premises): lexpr = Expression.fromstring prover = Prover9Command(lexpr(goal), premises) command = UniqueNamesProver(ClosedWorldProver(prover)) print(goal, prover.prove(), command.prove()) def demo(): closed_domain_demo() unique_names_demo() closed_world_demo() combination_prover_demo() default_reasoning_demo() if __name__ == '__main__': demo() nltk-3.1/nltk/inference/prover9.py0000644000076500000240000003621512607224144016753 0ustar sbstaff00000000000000# Natural Language Toolkit: Interface to the Prover9 Theorem Prover # # Copyright (C) 2001-2015 NLTK Project # Author: Dan Garrette # Ewan Klein # # URL: # For license information, see LICENSE.TXT """ A theorem prover that makes use of the external 'Prover9' package. """ from __future__ import print_function import os import subprocess import nltk from nltk.sem.logic import Expression, ExistsExpression, AllExpression, \ NegatedExpression, AndExpression, IffExpression, OrExpression, \ EqualityExpression, ImpExpression from nltk.inference.api import BaseProverCommand, Prover # # Following is not yet used. Return code for 2 actually realized as 512. # p9_return_codes = { 0: True, 1: "(FATAL)", #A fatal error occurred (user's syntax error). 2: False, # (SOS_EMPTY) Prover9 ran out of things to do # (sos list exhausted). 
3: "(MAX_MEGS)", # The max_megs (memory limit) parameter was exceeded. 4: "(MAX_SECONDS)", # The max_seconds parameter was exceeded. 5: "(MAX_GIVEN)", # The max_given parameter was exceeded. 6: "(MAX_KEPT)", # The max_kept parameter was exceeded. 7: "(ACTION)", # A Prover9 action terminated the search. 101: "(SIGSEGV)", # Prover9 crashed, most probably due to a bug. } class Prover9CommandParent(object): """ A common base class used by both ``Prover9Command`` and ``MaceCommand``, which is responsible for maintaining a goal and a set of assumptions, and generating prover9-style input files from them. """ def print_assumptions(self, output_format='nltk'): """ Print the list of the current assumptions. """ if output_format.lower() == 'nltk': for a in self.assumptions(): print(a) elif output_format.lower() == 'prover9': for a in convert_to_prover9(self.assumptions()): print(a) else: raise NameError("Unrecognized value for 'output_format': %s" % output_format) class Prover9Command(Prover9CommandParent, BaseProverCommand): """ A ``ProverCommand`` specific to the ``Prover9`` prover. It contains the a print_assumptions() method that is used to print the list of assumptions in multiple formats. """ def __init__(self, goal=None, assumptions=None, timeout=60, prover=None): """ :param goal: Input expression to prove :type goal: sem.Expression :param assumptions: Input expressions to use as assumptions in the proof. :type assumptions: list(sem.Expression) :param timeout: number of seconds before timeout; set to 0 for no timeout. :type timeout: int :param prover: a prover. If not set, one will be created. :type prover: Prover9 """ if not assumptions: assumptions = [] if prover is not None: assert isinstance(prover, Prover9) else: prover = Prover9(timeout) BaseProverCommand.__init__(self, prover, goal, assumptions) def decorate_proof(self, proof_string, simplify=True): """ :see BaseProverCommand.decorate_proof() """ if simplify: return self._prover._call_prooftrans(proof_string, ['striplabels'])[0].rstrip() else: return proof_string.rstrip() class Prover9Parent(object): """ A common class extended by both ``Prover9`` and ``Mace ``. It contains the functionality required to convert NLTK-style expressions into Prover9-style expressions. """ _binary_location = None def config_prover9(self, binary_location, verbose=False): if binary_location is None: self._binary_location = None self._prover9_bin = None else: name = 'prover9' self._prover9_bin = nltk.internals.find_binary( name, path_to_bin=binary_location, env_vars=['PROVER9'], url='http://www.cs.unm.edu/~mccune/prover9/', binary_names=[name, name + '.exe'], verbose=verbose) self._binary_location = self._prover9_bin.rsplit(os.path.sep, 1) def prover9_input(self, goal, assumptions): """ :return: The input string that should be provided to the prover9 binary. This string is formed based on the goal, assumptions, and timeout value of this object. """ s = '' if assumptions: s += 'formulas(assumptions).\n' for p9_assumption in convert_to_prover9(assumptions): s += ' %s.\n' % p9_assumption s += 'end_of_list.\n\n' if goal: s += 'formulas(goals).\n' s += ' %s.\n' % convert_to_prover9(goal) s += 'end_of_list.\n\n' return s def binary_locations(self): """ A list of directories that should be searched for the prover9 executables. This list is used by ``config_prover9`` when searching for the prover9 executables. 
""" return ['/usr/local/bin/prover9', '/usr/local/bin/prover9/bin', '/usr/local/bin', '/usr/bin', '/usr/local/prover9', '/usr/local/share/prover9'] def _find_binary(self, name, verbose=False): binary_locations = self.binary_locations() if self._binary_location is not None: binary_locations += [self._binary_location] return nltk.internals.find_binary(name, searchpath=binary_locations, env_vars=['PROVER9'], url='http://www.cs.unm.edu/~mccune/prover9/', binary_names=[name, name + '.exe'], verbose=verbose) def _call(self, input_str, binary, args=[], verbose=False): """ Call the binary with the given input. :param input_str: A string whose contents are used as stdin. :param binary: The location of the binary to call :param args: A list of command-line arguments. :return: A tuple (stdout, returncode) :see: ``config_prover9`` """ if verbose: print('Calling:', binary) print('Args:', args) print('Input:\n', input_str, '\n') # Call prover9 via a subprocess cmd = [binary] + args try: input_str = input_str.encode("utf8") except AttributeError: pass p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, stdin=subprocess.PIPE) (stdout, stderr) = p.communicate(input=input_str) if verbose: print('Return code:', p.returncode) if stdout: print('stdout:\n', stdout, '\n') if stderr: print('stderr:\n', stderr, '\n') return (stdout.decode("utf-8"), p.returncode) def convert_to_prover9(input): """ Convert a ``logic.Expression`` to Prover9 format. """ if isinstance(input, list): result = [] for s in input: try: result.append(_convert_to_prover9(s.simplify())) except: print('input %s cannot be converted to Prover9 input syntax' % input) raise return result else: try: return _convert_to_prover9(input.simplify()) except: print('input %s cannot be converted to Prover9 input syntax' % input) raise def _convert_to_prover9(expression): """ Convert ``logic.Expression`` to Prover9 formatted string. """ if isinstance(expression, ExistsExpression): return 'exists ' + str(expression.variable) + ' ' + _convert_to_prover9(expression.term) elif isinstance(expression, AllExpression): return 'all ' + str(expression.variable) + ' ' + _convert_to_prover9(expression.term) elif isinstance(expression, NegatedExpression): return '-(' + _convert_to_prover9(expression.term) + ')' elif isinstance(expression, AndExpression): return '(' + _convert_to_prover9(expression.first) + ' & ' + \ _convert_to_prover9(expression.second) + ')' elif isinstance(expression, OrExpression): return '(' + _convert_to_prover9(expression.first) + ' | ' + \ _convert_to_prover9(expression.second) + ')' elif isinstance(expression, ImpExpression): return '(' + _convert_to_prover9(expression.first) + ' -> ' + \ _convert_to_prover9(expression.second) + ')' elif isinstance(expression, IffExpression): return '(' + _convert_to_prover9(expression.first) + ' <-> ' + \ _convert_to_prover9(expression.second) + ')' elif isinstance(expression, EqualityExpression): return '(' + _convert_to_prover9(expression.first) + ' = ' + \ _convert_to_prover9(expression.second) + ')' else: return str(expression) class Prover9(Prover9Parent, Prover): _prover9_bin = None _prooftrans_bin = None def __init__(self, timeout=60): self._timeout = timeout """The timeout value for prover9. If a proof can not be found in this amount of time, then prover9 will return false. (Use 0 for no timeout.)""" def _prove(self, goal=None, assumptions=None, verbose=False): """ Use Prover9 to prove a theorem. 
:return: A pair whose first element is a boolean indicating if the proof was successful (i.e. returns value of 0) and whose second element is the output of the prover. """ if not assumptions: assumptions = [] stdout, returncode = self._call_prover9(self.prover9_input(goal, assumptions), verbose=verbose) return (returncode == 0, stdout) def prover9_input(self, goal, assumptions): """ :see: Prover9Parent.prover9_input """ s = 'clear(auto_denials).\n' #only one proof required return s + Prover9Parent.prover9_input(self, goal, assumptions) def _call_prover9(self, input_str, args=[], verbose=False): """ Call the ``prover9`` binary with the given input. :param input_str: A string whose contents are used as stdin. :param args: A list of command-line arguments. :return: A tuple (stdout, returncode) :see: ``config_prover9`` """ if self._prover9_bin is None: self._prover9_bin = self._find_binary('prover9', verbose) updated_input_str = '' if self._timeout > 0: updated_input_str += 'assign(max_seconds, %d).\n\n' % self._timeout updated_input_str += input_str stdout, returncode = self._call(updated_input_str, self._prover9_bin, args, verbose) if returncode not in [0,2]: errormsgprefix = '%%ERROR:' if errormsgprefix in stdout: msgstart = stdout.index(errormsgprefix) errormsg = stdout[msgstart:].strip() else: errormsg = None if returncode in [3,4,5,6]: raise Prover9LimitExceededException(returncode, errormsg) else: raise Prover9FatalException(returncode, errormsg) return stdout, returncode def _call_prooftrans(self, input_str, args=[], verbose=False): """ Call the ``prooftrans`` binary with the given input. :param input_str: A string whose contents are used as stdin. :param args: A list of command-line arguments. :return: A tuple (stdout, returncode) :see: ``config_prover9`` """ if self._prooftrans_bin is None: self._prooftrans_bin = self._find_binary('prooftrans', verbose) return self._call(input_str, self._prooftrans_bin, args, verbose) class Prover9Exception(Exception): def __init__(self, returncode, message): msg = p9_return_codes[returncode] if message: msg += '\n%s' % message Exception.__init__(self, msg) class Prover9FatalException(Prover9Exception): pass class Prover9LimitExceededException(Prover9Exception): pass ###################################################################### #{ Tests and Demos ###################################################################### def test_config(): a = Expression.fromstring('(walk(j) & sing(j))') g = Expression.fromstring('walk(j)') p = Prover9Command(g, assumptions=[a]) p._executable_path = None p.prover9_search=[] p.prove() #config_prover9('/usr/local/bin') print(p.prove()) print(p.proof()) def test_convert_to_prover9(expr): """ Test that parsing works OK. """ for t in expr: e = Expression.fromstring(t) print(convert_to_prover9(e)) def test_prove(arguments): """ Try some proofs and exhibit the results. 
""" for (goal, assumptions) in arguments: g = Expression.fromstring(goal) alist = [Expression.fromstring(a) for a in assumptions] p = Prover9Command(g, assumptions=alist).prove() for a in alist: print(' %s' % a) print('|- %s: %s\n' % (g, p)) arguments = [ ('(man(x) <-> (not (not man(x))))', []), ('(not (man(x) & (not man(x))))', []), ('(man(x) | (not man(x)))', []), ('(man(x) & (not man(x)))', []), ('(man(x) -> man(x))', []), ('(not (man(x) & (not man(x))))', []), ('(man(x) | (not man(x)))', []), ('(man(x) -> man(x))', []), ('(man(x) <-> man(x))', []), ('(not (man(x) <-> (not man(x))))', []), ('mortal(Socrates)', ['all x.(man(x) -> mortal(x))', 'man(Socrates)']), ('((all x.(man(x) -> walks(x)) & man(Socrates)) -> some y.walks(y))', []), ('(all x.man(x) -> all x.man(x))', []), ('some x.all y.sees(x,y)', []), ('some e3.(walk(e3) & subj(e3, mary))', ['some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))']), ('some x e1.(see(e1) & subj(e1, x) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))', ['some e1.(see(e1) & subj(e1, john) & some e2.(pred(e1, e2) & walk(e2) & subj(e2, mary)))']) ] expressions = [r'some x y.sees(x,y)', r'some x.(man(x) & walks(x))', r'\x.(man(x) & walks(x))', r'\x y.sees(x,y)', r'walks(john)', r'\x.big(x, \y.mouse(y))', r'(walks(x) & (runs(x) & (threes(x) & fours(x))))', r'(walks(x) -> runs(x))', r'some x.(PRO(x) & sees(John, x))', r'some x.(man(x) & (not walks(x)))', r'all x.(man(x) -> walks(x))'] def spacer(num=45): print('-' * num) def demo(): print("Testing configuration") spacer() test_config() print() print("Testing conversion to Prover9 format") spacer() test_convert_to_prover9(expressions) print() print("Testing proofs") spacer() test_prove(arguments) if __name__ == '__main__': demo() nltk-3.1/nltk/inference/resolution.py0000755000076500000240000006176112607224144017557 0ustar sbstaff00000000000000# Natural Language Toolkit: First-order Resolution-based Theorem Prover # # Author: Dan Garrette # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT """ Module for a resolution-based First Order theorem prover. 
""" from __future__ import print_function, unicode_literals import operator from collections import defaultdict from functools import reduce from nltk.sem import skolemize from nltk.sem.logic import (VariableExpression, EqualityExpression, ApplicationExpression, Expression, NegatedExpression, Variable, AndExpression, unique_variable, OrExpression, is_indvar, IndividualVariableExpression, Expression) from nltk.inference.api import Prover, BaseProverCommand from nltk.compat import python_2_unicode_compatible class ProverParseError(Exception): pass class ResolutionProver(Prover): ANSWER_KEY = 'ANSWER' _assume_false=True def _prove(self, goal=None, assumptions=None, verbose=False): """ :param goal: Input expression to prove :type goal: sem.Expression :param assumptions: Input expressions to use as assumptions in the proof :type assumptions: list(sem.Expression) """ if not assumptions: assumptions = [] result = None try: clauses = [] if goal: clauses.extend(clausify(-goal)) for a in assumptions: clauses.extend(clausify(a)) result, clauses = self._attempt_proof(clauses) if verbose: print(ResolutionProverCommand._decorate_clauses(clauses)) except RuntimeError as e: if self._assume_false and str(e).startswith('maximum recursion depth exceeded'): result = False clauses = [] else: if verbose: print(e) else: raise e return (result, clauses) def _attempt_proof(self, clauses): #map indices to lists of indices, to store attempted unifications tried = defaultdict(list) i = 0 while i < len(clauses): if not clauses[i].is_tautology(): #since we try clauses in order, we should start after the last #index tried if tried[i]: j = tried[i][-1] + 1 else: j = i + 1 #nothing tried yet for 'i', so start with the next while j < len(clauses): #don't: 1) unify a clause with itself, # 2) use tautologies if i != j and j and not clauses[j].is_tautology(): tried[i].append(j) newclauses = clauses[i].unify(clauses[j]) if newclauses: for newclause in newclauses: newclause._parents = (i+1, j+1) clauses.append(newclause) if not len(newclause): #if there's an empty clause return (True, clauses) i=-1 #since we added a new clause, restart from the top break j += 1 i += 1 return (False, clauses) class ResolutionProverCommand(BaseProverCommand): def __init__(self, goal=None, assumptions=None, prover=None): """ :param goal: Input expression to prove :type goal: sem.Expression :param assumptions: Input expressions to use as assumptions in the proof. :type assumptions: list(sem.Expression) """ if prover is not None: assert isinstance(prover, ResolutionProver) else: prover = ResolutionProver() BaseProverCommand.__init__(self, prover, goal, assumptions) self._clauses = None def prove(self, verbose=False): """ Perform the actual proof. Store the result to prevent unnecessary re-proving. """ if self._result is None: self._result, clauses = self._prover._prove(self.goal(), self.assumptions(), verbose) self._clauses = clauses self._proof = ResolutionProverCommand._decorate_clauses(clauses) return self._result def find_answers(self, verbose=False): self.prove(verbose) answers = set() answer_ex = VariableExpression(Variable(ResolutionProver.ANSWER_KEY)) for clause in self._clauses: for term in clause: if isinstance(term, ApplicationExpression) and\ term.function == answer_ex and\ not isinstance(term.argument, IndividualVariableExpression): answers.add(term.argument) return answers @staticmethod def _decorate_clauses(clauses): """ Decorate the proof output. 
""" out = '' max_clause_len = max([len(str(clause)) for clause in clauses]) max_seq_len = len(str(len(clauses))) for i in range(len(clauses)): parents = 'A' taut = '' if clauses[i].is_tautology(): taut = 'Tautology' if clauses[i]._parents: parents = str(clauses[i]._parents) parents = ' '*(max_clause_len-len(str(clauses[i]))+1) + parents seq = ' '*(max_seq_len-len(str(i+1))) + str(i+1) out += '[%s] %s %s %s\n' % (seq, clauses[i], parents, taut) return out @python_2_unicode_compatible class Clause(list): def __init__(self, data): list.__init__(self, data) self._is_tautology = None self._parents = None def unify(self, other, bindings=None, used=None, skipped=None, debug=False): """ Attempt to unify this Clause with the other, returning a list of resulting, unified, Clauses. :param other: ``Clause`` with which to unify :param bindings: ``BindingDict`` containing bindings that should be used during the unification :param used: tuple of two lists of atoms. The first lists the atoms from 'self' that were successfully unified with atoms from 'other'. The second lists the atoms from 'other' that were successfully unified with atoms from 'self'. :param skipped: tuple of two ``Clause`` objects. The first is a list of all the atoms from the 'self' Clause that have not been unified with anything on the path. The second is same thing for the 'other' Clause. :param debug: bool indicating whether debug statements should print :return: list containing all the resulting ``Clause`` objects that could be obtained by unification """ if bindings is None: bindings = BindingDict() if used is None: used = ([],[]) if skipped is None: skipped = ([],[]) if isinstance(debug, bool): debug = DebugObject(debug) newclauses = _iterate_first(self, other, bindings, used, skipped, _complete_unify_path, debug) #remove subsumed clauses. make a list of all indices of subsumed #clauses, and then remove them from the list subsumed = [] for i, c1 in enumerate(newclauses): if i not in subsumed: for j, c2 in enumerate(newclauses): if i!=j and j not in subsumed and c1.subsumes(c2): subsumed.append(j) result = [] for i in range(len(newclauses)): if i not in subsumed: result.append(newclauses[i]) return result def isSubsetOf(self, other): """ Return True iff every term in 'self' is a term in 'other'. :param other: ``Clause`` :return: bool """ for a in self: if a not in other: return False return True def subsumes(self, other): """ Return True iff 'self' subsumes 'other', this is, if there is a substitution such that every term in 'self' can be unified with a term in 'other'. :param other: ``Clause`` :return: bool """ negatedother = [] for atom in other: if isinstance(atom, NegatedExpression): negatedother.append(atom.term) else: negatedother.append(-atom) negatedotherClause = Clause(negatedother) bindings = BindingDict() used = ([],[]) skipped = ([],[]) debug = DebugObject(False) return len(_iterate_first(self, negatedotherClause, bindings, used, skipped, _subsumes_finalize, debug)) > 0 def __getslice__(self, start, end): return Clause(list.__getslice__(self, start, end)) def __sub__(self, other): return Clause([a for a in self if a not in other]) def __add__(self, other): return Clause(list.__add__(self, other)) def is_tautology(self): """ Self is a tautology if it contains ground terms P and -P. The ground term, P, must be an exact match, ie, not using unification. 
""" if self._is_tautology is not None: return self._is_tautology for i,a in enumerate(self): if not isinstance(a, EqualityExpression): j = len(self)-1 while j > i: b = self[j] if isinstance(a, NegatedExpression): if a.term == b: self._is_tautology = True return True elif isinstance(b, NegatedExpression): if a == b.term: self._is_tautology = True return True j -= 1 self._is_tautology = False return False def free(self): return reduce(operator.or_, ((atom.free() | atom.constants()) for atom in self)) def replace(self, variable, expression): """ Replace every instance of variable with expression across every atom in the clause :param variable: ``Variable`` :param expression: ``Expression`` """ return Clause([atom.replace(variable, expression) for atom in self]) def substitute_bindings(self, bindings): """ Replace every binding :param bindings: A list of tuples mapping Variable Expressions to the Expressions to which they are bound :return: ``Clause`` """ return Clause([atom.substitute_bindings(bindings) for atom in self]) def __str__(self): return '{' + ', '.join("%s" % item for item in self) + '}' def __repr__(self): return "%s" % self def _iterate_first(first, second, bindings, used, skipped, finalize_method, debug): """ This method facilitates movement through the terms of 'self' """ debug.line('unify(%s,%s) %s'%(first, second, bindings)) if not len(first) or not len(second): #if no more recursions can be performed return finalize_method(first, second, bindings, used, skipped, debug) else: #explore this 'self' atom result = _iterate_second(first, second, bindings, used, skipped, finalize_method, debug+1) #skip this possible 'self' atom newskipped = (skipped[0]+[first[0]], skipped[1]) result += _iterate_first(first[1:], second, bindings, used, newskipped, finalize_method, debug+1) try: newbindings, newused, unused = _unify_terms(first[0], second[0], bindings, used) #Unification found, so progress with this line of unification #put skipped and unused terms back into play for later unification. newfirst = first[1:] + skipped[0] + unused[0] newsecond = second[1:] + skipped[1] + unused[1] result += _iterate_first(newfirst, newsecond, newbindings, newused, ([],[]), finalize_method, debug+1) except BindingException: #the atoms could not be unified, pass return result def _iterate_second(first, second, bindings, used, skipped, finalize_method, debug): """ This method facilitates movement through the terms of 'other' """ debug.line('unify(%s,%s) %s'%(first, second, bindings)) if not len(first) or not len(second): #if no more recursions can be performed return finalize_method(first, second, bindings, used, skipped, debug) else: #skip this possible pairing and move to the next newskipped = (skipped[0], skipped[1]+[second[0]]) result = _iterate_second(first, second[1:], bindings, used, newskipped, finalize_method, debug+1) try: newbindings, newused, unused = _unify_terms(first[0], second[0], bindings, used) #Unification found, so progress with this line of unification #put skipped and unused terms back into play for later unification. newfirst = first[1:] + skipped[0] + unused[0] newsecond = second[1:] + skipped[1] + unused[1] result += _iterate_second(newfirst, newsecond, newbindings, newused, ([],[]), finalize_method, debug+1) except BindingException: #the atoms could not be unified, pass return result def _unify_terms(a, b, bindings=None, used=None): """ This method attempts to unify two terms. Two expressions are unifiable if there exists a substitution function S such that S(a) == S(-b). 
:param a: ``Expression`` :param b: ``Expression`` :param bindings: ``BindingDict`` a starting set of bindings with which the unification must be consistent :return: ``BindingDict`` A dictionary of the bindings required to unify :raise ``BindingException``: If the terms cannot be unified """ assert isinstance(a, Expression) assert isinstance(b, Expression) if bindings is None: bindings = BindingDict() if used is None: used = ([],[]) # Use resolution if isinstance(a, NegatedExpression) and isinstance(b, ApplicationExpression): newbindings = most_general_unification(a.term, b, bindings) newused = (used[0]+[a], used[1]+[b]) unused = ([],[]) elif isinstance(a, ApplicationExpression) and isinstance(b, NegatedExpression): newbindings = most_general_unification(a, b.term, bindings) newused = (used[0]+[a], used[1]+[b]) unused = ([],[]) # Use demodulation elif isinstance(a, EqualityExpression): newbindings = BindingDict([(a.first.variable, a.second)]) newused = (used[0]+[a], used[1]) unused = ([],[b]) elif isinstance(b, EqualityExpression): newbindings = BindingDict([(b.first.variable, b.second)]) newused = (used[0], used[1]+[b]) unused = ([a],[]) else: raise BindingException((a, b)) return newbindings, newused, unused def _complete_unify_path(first, second, bindings, used, skipped, debug): if used[0] or used[1]: #if bindings were made along the path newclause = Clause(skipped[0] + skipped[1] + first + second) debug.line(' -> New Clause: %s' % newclause) return [newclause.substitute_bindings(bindings)] else: #no bindings made means no unification occurred. so no result debug.line(' -> End') return [] def _subsumes_finalize(first, second, bindings, used, skipped, debug): if not len(skipped[0]) and not len(first): #If there are no skipped terms and no terms left in 'first', then #all of the terms in the original 'self' were unified with terms #in 'other'. Therefore, there exists a binding (this one) such that #every term in self can be unified with a term in other, which #is the definition of subsumption. return [True] else: return [] def clausify(expression): """ Skolemize, clausify, and standardize the variables apart. 
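For example, ``clausify(Expression.fromstring('(P(x) & Q(x)) | R(x)'))``
yields two clauses of the form ``{P(z1), R(z1)}`` and ``{Q(z2), R(z2)}``;
the ``z``-variables are freshly generated, so the exact names vary between
runs.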
""" clause_list = [] for clause in _clausify(skolemize(expression)): for free in clause.free(): if is_indvar(free.name): newvar = VariableExpression(unique_variable()) clause = clause.replace(free, newvar) clause_list.append(clause) return clause_list def _clausify(expression): """ :param expression: a skolemized expression in CNF """ if isinstance(expression, AndExpression): return _clausify(expression.first) + _clausify(expression.second) elif isinstance(expression, OrExpression): first = _clausify(expression.first) second = _clausify(expression.second) assert len(first) == 1 assert len(second) == 1 return [first[0] + second[0]] elif isinstance(expression, EqualityExpression): return [Clause([expression])] elif isinstance(expression, ApplicationExpression): return [Clause([expression])] elif isinstance(expression, NegatedExpression): if isinstance(expression.term, ApplicationExpression): return [Clause([expression])] elif isinstance(expression.term, EqualityExpression): return [Clause([expression])] raise ProverParseError() @python_2_unicode_compatible class BindingDict(object): def __init__(self, binding_list=None): """ :param binding_list: list of (``AbstractVariableExpression``, ``AtomicExpression``) to initialize the dictionary """ self.d = {} if binding_list: for (v, b) in binding_list: self[v] = b def __setitem__(self, variable, binding): """ A binding is consistent with the dict if its variable is not already bound, OR if its variable is already bound to its argument. :param variable: ``Variable`` The variable to bind :param binding: ``Expression`` The atomic to which 'variable' should be bound :raise BindingException: If the variable cannot be bound in this dictionary """ assert isinstance(variable, Variable) assert isinstance(binding, Expression) try: existing = self[variable] except KeyError: existing = None if not existing or binding == existing: self.d[variable] = binding elif isinstance(binding, IndividualVariableExpression): # Since variable is already bound, try to bind binding to variable try: existing = self[binding.variable] except KeyError: existing = None binding2 = VariableExpression(variable) if not existing or binding2 == existing: self.d[binding.variable] = binding2 else: raise BindingException('Variable %s already bound to another ' 'value' % (variable)) else: raise BindingException('Variable %s already bound to another ' 'value' % (variable)) def __getitem__(self, variable): """ Return the expression to which 'variable' is bound """ assert isinstance(variable, Variable) intermediate = self.d[variable] while intermediate: try: intermediate = self.d[intermediate] except KeyError: return intermediate def __contains__(self, item): return item in self.d def __add__(self, other): """ :param other: ``BindingDict`` The dict with which to combine self :return: ``BindingDict`` A new dict containing all the elements of both parameters :raise BindingException: If the parameter dictionaries are not consistent with each other """ try: combined = BindingDict() for v in self.d: combined[v] = self.d[v] for v in other.d: combined[v] = other.d[v] return combined except BindingException: raise BindingException("Attempting to add two contradicting " "BindingDicts: '%s' and '%s'" % (self, other)) def __len__(self): return len(self.d) def __str__(self): data_str = ', '.join('%s: %s' % (v, self.d[v]) for v in sorted(self.d.keys())) return '{' + data_str + '}' def __repr__(self): return "%s" % self def most_general_unification(a, b, bindings=None): """ Find the most general unification 
of the two given expressions :param a: ``Expression`` :param b: ``Expression`` :param bindings: ``BindingDict`` a starting set of bindings with which the unification must be consistent :return: a list of bindings :raise BindingException: if the Expressions cannot be unified """ if bindings is None: bindings = BindingDict() if a == b: return bindings elif isinstance(a, IndividualVariableExpression): return _mgu_var(a, b, bindings) elif isinstance(b, IndividualVariableExpression): return _mgu_var(b, a, bindings) elif isinstance(a, ApplicationExpression) and\ isinstance(b, ApplicationExpression): return most_general_unification(a.function, b.function, bindings) +\ most_general_unification(a.argument, b.argument, bindings) raise BindingException((a, b)) def _mgu_var(var, expression, bindings): if var.variable in expression.free()|expression.constants(): raise BindingException((var, expression)) else: return BindingDict([(var.variable, expression)]) + bindings class BindingException(Exception): def __init__(self, arg): if isinstance(arg, tuple): Exception.__init__(self, "'%s' cannot be bound to '%s'" % arg) else: Exception.__init__(self, arg) class UnificationException(Exception): def __init__(self, a, b): Exception.__init__(self, "'%s' cannot unify with '%s'" % (a,b)) class DebugObject(object): def __init__(self, enabled=True, indent=0): self.enabled = enabled self.indent = indent def __add__(self, i): return DebugObject(self.enabled, self.indent+i) def line(self, line): if self.enabled: print(' '*self.indent + line) def testResolutionProver(): resolution_test(r'man(x)') resolution_test(r'(man(x) -> man(x))') resolution_test(r'(man(x) -> --man(x))') resolution_test(r'-(man(x) and -man(x))') resolution_test(r'(man(x) or -man(x))') resolution_test(r'(man(x) -> man(x))') resolution_test(r'-(man(x) and -man(x))') resolution_test(r'(man(x) or -man(x))') resolution_test(r'(man(x) -> man(x))') resolution_test(r'(man(x) iff man(x))') resolution_test(r'-(man(x) iff -man(x))') resolution_test('all x.man(x)') resolution_test('-all x.some y.F(x,y) & some x.all y.(-F(x,y))') resolution_test('some x.all y.sees(x,y)') p1 = Expression.fromstring(r'all x.(man(x) -> mortal(x))') p2 = Expression.fromstring(r'man(Socrates)') c = Expression.fromstring(r'mortal(Socrates)') print('%s, %s |- %s: %s' % (p1, p2, c, ResolutionProver().prove(c, [p1,p2]))) p1 = Expression.fromstring(r'all x.(man(x) -> walks(x))') p2 = Expression.fromstring(r'man(John)') c = Expression.fromstring(r'some y.walks(y)') print('%s, %s |- %s: %s' % (p1, p2, c, ResolutionProver().prove(c, [p1,p2]))) p = Expression.fromstring(r'some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))') c = Expression.fromstring(r'some e0.walk(e0,mary)') print('%s |- %s: %s' % (p, c, ResolutionProver().prove(c, [p]))) def resolution_test(e): f = Expression.fromstring(e) t = ResolutionProver().prove(f) print('|- %s: %s' % (f, t)) def test_clausify(): lexpr = Expression.fromstring print(clausify(lexpr('P(x) | Q(x)'))) print(clausify(lexpr('(P(x) & Q(x)) | R(x)'))) print(clausify(lexpr('P(x) | (Q(x) & R(x))'))) print(clausify(lexpr('(P(x) & Q(x)) | (R(x) & S(x))'))) print(clausify(lexpr('P(x) | Q(x) | R(x)'))) print(clausify(lexpr('P(x) | (Q(x) & R(x)) | S(x)'))) print(clausify(lexpr('exists x.P(x) | Q(x)'))) print(clausify(lexpr('-(-P(x) & Q(x))'))) print(clausify(lexpr('P(x) <-> Q(x)'))) print(clausify(lexpr('-(P(x) <-> Q(x))'))) print(clausify(lexpr('-(all x.P(x))'))) print(clausify(lexpr('-(some x.P(x))'))) print(clausify(lexpr('some x.P(x)'))) 
print(clausify(lexpr('some x.all y.P(x,y)'))) print(clausify(lexpr('all y.some x.P(x,y)'))) print(clausify(lexpr('all z.all y.some x.P(x,y,z)'))) print(clausify(lexpr('all x.(all y.P(x,y) -> -all y.(Q(x,y) -> R(x,y)))'))) def demo(): test_clausify() print() testResolutionProver() print() p = Expression.fromstring('man(x)') print(ResolutionProverCommand(p, [p]).prove()) if __name__ == '__main__': demo() nltk-3.1/nltk/inference/tableau.py0000644000076500000240000006142412607224144016762 0ustar sbstaff00000000000000# Natural Language Toolkit: First-Order Tableau Theorem Prover # # Copyright (C) 2001-2015 NLTK Project # Author: Dan Garrette # # URL: # For license information, see LICENSE.TXT """ Module for a tableau-based First Order theorem prover. """ from __future__ import print_function, unicode_literals from nltk.internals import Counter from nltk.sem.logic import (VariableExpression, EqualityExpression, ApplicationExpression, Expression, AbstractVariableExpression, AllExpression, NegatedExpression, ExistsExpression, Variable, ImpExpression, AndExpression, unique_variable, LambdaExpression, IffExpression, OrExpression, FunctionVariableExpression) from nltk.inference.api import Prover, BaseProverCommand _counter = Counter() class ProverParseError(Exception): pass class TableauProver(Prover): _assume_false=False def _prove(self, goal=None, assumptions=None, verbose=False): if not assumptions: assumptions = [] result = None try: agenda = Agenda() if goal: agenda.put(-goal) agenda.put_all(assumptions) debugger = Debug(verbose) result = self._attempt_proof(agenda, set(), set(), debugger) except RuntimeError as e: if self._assume_false and str(e).startswith('maximum recursion depth exceeded'): result = False else: if verbose: print(e) else: raise e return (result, '\n'.join(debugger.lines)) def _attempt_proof(self, agenda, accessible_vars, atoms, debug): (current, context), category = agenda.pop_first() #if there's nothing left in the agenda, and we haven't closed the path if not current: debug.line('AGENDA EMPTY') return False proof_method = { Categories.ATOM: self._attempt_proof_atom, Categories.PROP: self._attempt_proof_prop, Categories.N_ATOM: self._attempt_proof_n_atom, Categories.N_PROP: self._attempt_proof_n_prop, Categories.APP: self._attempt_proof_app, Categories.N_APP: self._attempt_proof_n_app, Categories.N_EQ: self._attempt_proof_n_eq, Categories.D_NEG: self._attempt_proof_d_neg, Categories.N_ALL: self._attempt_proof_n_all, Categories.N_EXISTS: self._attempt_proof_n_some, Categories.AND: self._attempt_proof_and, Categories.N_OR: self._attempt_proof_n_or, Categories.N_IMP: self._attempt_proof_n_imp, Categories.OR: self._attempt_proof_or, Categories.IMP: self._attempt_proof_imp, Categories.N_AND: self._attempt_proof_n_and, Categories.IFF: self._attempt_proof_iff, Categories.N_IFF: self._attempt_proof_n_iff, Categories.EQ: self._attempt_proof_eq, Categories.EXISTS: self._attempt_proof_some, Categories.ALL: self._attempt_proof_all, }[category] debug.line((current, context)) return proof_method(current, context, agenda, accessible_vars, atoms, debug) def _attempt_proof_atom(self, current, context, agenda, accessible_vars, atoms, debug): # Check if the branch is closed. 
Return 'True' if it is if (current, True) in atoms: debug.line('CLOSED', 1) return True if context: if isinstance(context.term, NegatedExpression): current = current.negate() agenda.put(context(current).simplify()) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) else: #mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars agenda.mark_alls_fresh(); return self._attempt_proof(agenda, accessible_vars|set(current.args), atoms|set([(current, False)]), debug+1) def _attempt_proof_n_atom(self, current, context, agenda, accessible_vars, atoms, debug): # Check if the branch is closed. Return 'True' if it is if (current.term, False) in atoms: debug.line('CLOSED', 1) return True if context: if isinstance(context.term, NegatedExpression): current = current.negate() agenda.put(context(current).simplify()) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) else: #mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars agenda.mark_alls_fresh(); return self._attempt_proof(agenda, accessible_vars|set(current.term.args), atoms|set([(current.term, True)]), debug+1) def _attempt_proof_prop(self, current, context, agenda, accessible_vars, atoms, debug): # Check if the branch is closed. Return 'True' if it is if (current, True) in atoms: debug.line('CLOSED', 1) return True #mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars agenda.mark_alls_fresh(); return self._attempt_proof(agenda, accessible_vars, atoms|set([(current, False)]), debug+1) def _attempt_proof_n_prop(self, current, context, agenda, accessible_vars, atoms, debug): # Check if the branch is closed. 
Return 'True' if it is if (current.term, False) in atoms: debug.line('CLOSED', 1) return True #mark all AllExpressions as 'not exhausted' into the agenda since we are (potentially) adding new accessible vars agenda.mark_alls_fresh(); return self._attempt_proof(agenda, accessible_vars, atoms|set([(current.term, True)]), debug+1) def _attempt_proof_app(self, current, context, agenda, accessible_vars, atoms, debug): f, args = current.uncurry() for i, arg in enumerate(args): if not TableauProver.is_atom(arg): ctx = f nv = Variable('X%s' % _counter.get()) for j,a in enumerate(args): ctx = (ctx(VariableExpression(nv)) if i == j else ctx(a)) if context: ctx = context(ctx).simplify() ctx = LambdaExpression(nv, ctx) agenda.put(arg, ctx) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) raise Exception('If this method is called, there must be a non-atomic argument') def _attempt_proof_n_app(self, current, context, agenda, accessible_vars, atoms, debug): f, args = current.term.uncurry() for i, arg in enumerate(args): if not TableauProver.is_atom(arg): ctx = f nv = Variable('X%s' % _counter.get()) for j,a in enumerate(args): ctx = (ctx(VariableExpression(nv)) if i == j else ctx(a)) if context: #combine new context with existing ctx = context(ctx).simplify() ctx = LambdaExpression(nv, -ctx) agenda.put(-arg, ctx) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) raise Exception('If this method is called, there must be a non-atomic argument') def _attempt_proof_n_eq(self, current, context, agenda, accessible_vars, atoms, debug): ########################################################################### # Since 'current' is of type '~(a=b)', the path is closed if 'a' == 'b' ########################################################################### if current.term.first == current.term.second: debug.line('CLOSED', 1) return True agenda[Categories.N_EQ].add((current,context)) current._exhausted = True return self._attempt_proof(agenda, accessible_vars|set([current.term.first, current.term.second]), atoms, debug+1) def _attempt_proof_d_neg(self, current, context, agenda, accessible_vars, atoms, debug): agenda.put(current.term.term, context) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) def _attempt_proof_n_all(self, current, context, agenda, accessible_vars, atoms, debug): agenda[Categories.EXISTS].add((ExistsExpression(current.term.variable, -current.term.term), context)) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) def _attempt_proof_n_some(self, current, context, agenda, accessible_vars, atoms, debug): agenda[Categories.ALL].add((AllExpression(current.term.variable, -current.term.term), context)) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) def _attempt_proof_and(self, current, context, agenda, accessible_vars, atoms, debug): agenda.put(current.first, context) agenda.put(current.second, context) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) def _attempt_proof_n_or(self, current, context, agenda, accessible_vars, atoms, debug): agenda.put(-current.term.first, context) agenda.put(-current.term.second, context) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) def _attempt_proof_n_imp(self, current, context, agenda, accessible_vars, atoms, debug): agenda.put(current.term.first, context) agenda.put(-current.term.second, context) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) def _attempt_proof_or(self, current, context, agenda, accessible_vars, 
atoms, debug): new_agenda = agenda.clone() agenda.put(current.first, context) new_agenda.put(current.second, context) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) and \ self._attempt_proof(new_agenda, accessible_vars, atoms, debug+1) def _attempt_proof_imp(self, current, context, agenda, accessible_vars, atoms, debug): new_agenda = agenda.clone() agenda.put(-current.first, context) new_agenda.put(current.second, context) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) and \ self._attempt_proof(new_agenda, accessible_vars, atoms, debug+1) def _attempt_proof_n_and(self, current, context, agenda, accessible_vars, atoms, debug): new_agenda = agenda.clone() agenda.put(-current.term.first, context) new_agenda.put(-current.term.second, context) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) and \ self._attempt_proof(new_agenda, accessible_vars, atoms, debug+1) def _attempt_proof_iff(self, current, context, agenda, accessible_vars, atoms, debug): new_agenda = agenda.clone() agenda.put(current.first, context) agenda.put(current.second, context) new_agenda.put(-current.first, context) new_agenda.put(-current.second, context) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) and \ self._attempt_proof(new_agenda, accessible_vars, atoms, debug+1) def _attempt_proof_n_iff(self, current, context, agenda, accessible_vars, atoms, debug): new_agenda = agenda.clone() agenda.put(current.term.first, context) agenda.put(-current.term.second, context) new_agenda.put(-current.term.first, context) new_agenda.put(current.term.second, context) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) and \ self._attempt_proof(new_agenda, accessible_vars, atoms, debug+1) def _attempt_proof_eq(self, current, context, agenda, accessible_vars, atoms, debug): ######################################################################### # Since 'current' is of the form '(a = b)', replace ALL free instances # of 'a' with 'b' ######################################################################### agenda.put_atoms(atoms) agenda.replace_all(current.first, current.second) accessible_vars.discard(current.first) agenda.mark_neqs_fresh(); return self._attempt_proof(agenda, accessible_vars, set(), debug+1) def _attempt_proof_some(self, current, context, agenda, accessible_vars, atoms, debug): new_unique_variable = VariableExpression(unique_variable()) agenda.put(current.term.replace(current.variable, new_unique_variable), context) agenda.mark_alls_fresh() return self._attempt_proof(agenda, accessible_vars|set([new_unique_variable]), atoms, debug+1) def _attempt_proof_all(self, current, context, agenda, accessible_vars, atoms, debug): try: current._used_vars except AttributeError: current._used_vars = set() #if there are accessible_vars on the path if accessible_vars: # get the set of bound variables that have not be used by this AllExpression bv_available = accessible_vars - current._used_vars if bv_available: variable_to_use = list(bv_available)[0] debug.line('--> Using \'%s\'' % variable_to_use, 2) current._used_vars |= set([variable_to_use]) agenda.put(current.term.replace(current.variable, variable_to_use), context) agenda[Categories.ALL].add((current,context)) return self._attempt_proof(agenda, accessible_vars, atoms, debug+1) else: #no more available variables to substitute debug.line('--> Variables Exhausted', 2) current._exhausted = True agenda[Categories.ALL].add((current,context)) return self._attempt_proof(agenda, accessible_vars, 
atoms, debug+1) else: new_unique_variable = VariableExpression(unique_variable()) debug.line('--> Using \'%s\'' % new_unique_variable, 2) current._used_vars |= set([new_unique_variable]) agenda.put(current.term.replace(current.variable, new_unique_variable), context) agenda[Categories.ALL].add((current,context)) agenda.mark_alls_fresh() return self._attempt_proof(agenda, accessible_vars|set([new_unique_variable]), atoms, debug+1) @staticmethod def is_atom(e): if isinstance(e, NegatedExpression): e = e.term if isinstance(e, ApplicationExpression): for arg in e.args: if not TableauProver.is_atom(arg): return False return True elif isinstance(e, AbstractVariableExpression) or \ isinstance(e, LambdaExpression): return True else: return False class TableauProverCommand(BaseProverCommand): def __init__(self, goal=None, assumptions=None, prover=None): """ :param goal: Input expression to prove :type goal: sem.Expression :param assumptions: Input expressions to use as assumptions in the proof. :type assumptions: list(sem.Expression) """ if prover is not None: assert isinstance(prover, TableauProver) else: prover = TableauProver() BaseProverCommand.__init__(self, prover, goal, assumptions) class Agenda(object): def __init__(self): self.sets = tuple(set() for i in range(21)) def clone(self): new_agenda = Agenda() set_list = [s.copy() for s in self.sets] new_allExs = set() for allEx,_ in set_list[Categories.ALL]: new_allEx = AllExpression(allEx.variable, allEx.term) try: new_allEx._used_vars = set(used for used in allEx._used_vars) except AttributeError: new_allEx._used_vars = set() new_allExs.add((new_allEx,None)) set_list[Categories.ALL] = new_allExs set_list[Categories.N_EQ] = set((NegatedExpression(n_eq.term),ctx) for (n_eq,ctx) in set_list[Categories.N_EQ]) new_agenda.sets = tuple(set_list) return new_agenda def __getitem__(self, index): return self.sets[index] def put(self, expression, context=None): if isinstance(expression, AllExpression): ex_to_add = AllExpression(expression.variable, expression.term) try: ex_to_add._used_vars = set(used for used in expression._used_vars) except AttributeError: ex_to_add._used_vars = set() else: ex_to_add = expression self.sets[self._categorize_expression(ex_to_add)].add((ex_to_add, context)) def put_all(self, expressions): for expression in expressions: self.put(expression) def put_atoms(self, atoms): for atom, neg in atoms: if neg: self[Categories.N_ATOM].add((-atom,None)) else: self[Categories.ATOM].add((atom,None)) def pop_first(self): """ Pop the first expression that appears in the agenda """ for i,s in enumerate(self.sets): if s: if i in [Categories.N_EQ, Categories.ALL]: for ex in s: try: if not ex[0]._exhausted: s.remove(ex) return (ex, i) except AttributeError: s.remove(ex) return (ex, i) else: return (s.pop(), i) return ((None, None), None) def replace_all(self, old, new): for s in self.sets: for ex,ctx in s: ex.replace(old.variable, new) if ctx is not None: ctx.replace(old.variable, new) def mark_alls_fresh(self): for u,_ in self.sets[Categories.ALL]: u._exhausted = False def mark_neqs_fresh(self): for neq,_ in self.sets[Categories.N_EQ]: neq._exhausted = False def _categorize_expression(self, current): if isinstance(current, NegatedExpression): return self._categorize_NegatedExpression(current) elif isinstance(current, FunctionVariableExpression): return Categories.PROP elif TableauProver.is_atom(current): return Categories.ATOM elif isinstance(current, AllExpression): return Categories.ALL elif isinstance(current, AndExpression): return 
Categories.AND elif isinstance(current, OrExpression): return Categories.OR elif isinstance(current, ImpExpression): return Categories.IMP elif isinstance(current, IffExpression): return Categories.IFF elif isinstance(current, EqualityExpression): return Categories.EQ elif isinstance(current, ExistsExpression): return Categories.EXISTS elif isinstance(current, ApplicationExpression): return Categories.APP else: raise ProverParseError("cannot categorize %s" % \ current.__class__.__name__) def _categorize_NegatedExpression(self, current): negated = current.term if isinstance(negated, NegatedExpression): return Categories.D_NEG elif isinstance(negated, FunctionVariableExpression): return Categories.N_PROP elif TableauProver.is_atom(negated): return Categories.N_ATOM elif isinstance(negated, AllExpression): return Categories.N_ALL elif isinstance(negated, AndExpression): return Categories.N_AND elif isinstance(negated, OrExpression): return Categories.N_OR elif isinstance(negated, ImpExpression): return Categories.N_IMP elif isinstance(negated, IffExpression): return Categories.N_IFF elif isinstance(negated, EqualityExpression): return Categories.N_EQ elif isinstance(negated, ExistsExpression): return Categories.N_EXISTS elif isinstance(negated, ApplicationExpression): return Categories.N_APP else: raise ProverParseError("cannot categorize %s" % \ negated.__class__.__name__) class Debug(object): def __init__(self, verbose, indent=0, lines=None): self.verbose = verbose self.indent = indent if not lines: lines = [] self.lines = lines def __add__(self, increment): return Debug(self.verbose, self.indent+1, self.lines) def line(self, data, indent=0): if isinstance(data, tuple): ex, ctx = data if ctx: data = '%s, %s' % (ex, ctx) else: data = '%s' % ex if isinstance(ex, AllExpression): try: used_vars = "[%s]" % (",".join("%s" % ve.variable.name for ve in ex._used_vars)) data += ': %s' % used_vars except AttributeError: data += ': []' newline = '%s%s' % (' '*(self.indent+indent), data) self.lines.append(newline) if self.verbose: print(newline) class Categories(object): ATOM = 0 PROP = 1 N_ATOM = 2 N_PROP = 3 APP = 4 N_APP = 5 N_EQ = 6 D_NEG = 7 N_ALL = 8 N_EXISTS = 9 AND = 10 N_OR = 11 N_IMP = 12 OR = 13 IMP = 14 N_AND = 15 IFF = 16 N_IFF = 17 EQ = 18 EXISTS = 19 ALL = 20 def testTableauProver(): tableau_test('P | -P') tableau_test('P & -P') tableau_test('Q', ['P', '(P -> Q)']) tableau_test('man(x)') tableau_test('(man(x) -> man(x))') tableau_test('(man(x) -> --man(x))') tableau_test('-(man(x) and -man(x))') tableau_test('(man(x) or -man(x))') tableau_test('(man(x) -> man(x))') tableau_test('-(man(x) and -man(x))') tableau_test('(man(x) or -man(x))') tableau_test('(man(x) -> man(x))') tableau_test('(man(x) iff man(x))') tableau_test('-(man(x) iff -man(x))') tableau_test('all x.man(x)') tableau_test('all x.all y.((x = y) -> (y = x))') tableau_test('all x.all y.all z.(((x = y) & (y = z)) -> (x = z))') # tableau_test('-all x.some y.F(x,y) & some x.all y.(-F(x,y))') # tableau_test('some x.all y.sees(x,y)') p1 = 'all x.(man(x) -> mortal(x))' p2 = 'man(Socrates)' c = 'mortal(Socrates)' tableau_test(c, [p1, p2]) p1 = 'all x.(man(x) -> walks(x))' p2 = 'man(John)' c = 'some y.walks(y)' tableau_test(c, [p1, p2]) p = '((x = y) & walks(y))' c = 'walks(x)' tableau_test(c, [p]) p = '((x = y) & ((y = z) & (z = w)))' c = '(x = w)' tableau_test(c, [p]) p = 'some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))' c = 'some e0.walk(e0,mary)' tableau_test(c, [p]) c = '(exists x.exists z3.((x = Mary) & ((z3 = John) & 
sees(z3,x))) <-> exists x.exists z4.((x = John) & ((z4 = Mary) & sees(x,z4))))' tableau_test(c) # p = 'some e1.some e2.((believe e1 john e2) and (walk e2 mary))' # c = 'some x.some e3.some e4.((believe e3 x e4) and (walk e4 mary))' # tableau_test(c, [p]) def testHigherOrderTableauProver(): tableau_test('believe(j, -lie(b))', ['believe(j, -lie(b) & -cheat(b))']) tableau_test('believe(j, lie(b) & cheat(b))', ['believe(j, lie(b))']) tableau_test('believe(j, lie(b))', ['lie(b)']) #how do we capture that John believes all things that are true tableau_test('believe(j, know(b, cheat(b)))', ['believe(j, know(b, lie(b)) & know(b, steals(b) & cheat(b)))']) tableau_test('P(Q(y), R(y) & R(z))', ['P(Q(x) & Q(y), R(y) & R(z))']) tableau_test('believe(j, cheat(b) & lie(b))', ['believe(j, lie(b) & cheat(b))']) tableau_test('believe(j, -cheat(b) & -lie(b))', ['believe(j, -lie(b) & -cheat(b))']) def tableau_test(c, ps=None, verbose=False): pc = Expression.fromstring(c) pps = ([Expression.fromstring(p) for p in ps] if ps else []) if not ps: ps = [] print('%s |- %s: %s' % (', '.join(ps), pc, TableauProver().prove(pc, pps, verbose=verbose))) def demo(): testTableauProver() testHigherOrderTableauProver() if __name__ == '__main__': demo() nltk-3.1/nltk/internals.py0000644000076500000240000010566512607224144015414 0ustar sbstaff00000000000000# Natural Language Toolkit: Internal utility functions # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # Nitin Madnani # URL: # For license information, see LICENSE.TXT from __future__ import print_function import subprocess import os import fnmatch import re import warnings import textwrap import types import sys import stat import locale # Use the c version of ElementTree, which is faster, if possible: try: from xml.etree import cElementTree as ElementTree except ImportError: from xml.etree import ElementTree from nltk import __file__ from nltk import compat ########################################################################## # Java Via Command-Line ########################################################################## _java_bin = None _java_options = [] # [xx] add classpath option to config_java? def config_java(bin=None, options=None, verbose=True): """ Configure nltk's java interface, by letting nltk know where it can find the Java binary, and what extra options (if any) should be passed to Java when it is run. :param bin: The full path to the Java binary. If not specified, then nltk will search the system for a Java binary; and if one is not found, it will raise a ``LookupError`` exception. :type bin: str :param options: A list of options that should be passed to the Java binary when it is called. A common value is ``'-Xmx512m'``, which tells Java binary to increase the maximum heap size to 512 megabytes. If no options are specified, then do not modify the options list. :type options: list(str) """ global _java_bin, _java_options _java_bin = find_binary('java', bin, env_vars=['JAVAHOME', 'JAVA_HOME'], verbose=verbose, binary_names=['java.exe']) if options is not None: if isinstance(options, compat.string_types): options = options.split() _java_options = list(options) def java(cmd, classpath=None, stdin=None, stdout=None, stderr=None, blocking=True): """ Execute the given java command, by opening a subprocess that calls Java. If java has not yet been configured, it will be configured by calling ``config_java()`` with no arguments. :param cmd: The java command that should be called, formatted as a list of strings. 
Typically, the first string will be the name of the java class; and the remaining strings will be arguments for that java class. :type cmd: list(str) :param classpath: A ``':'`` separated list of directories, JAR archives, and ZIP archives to search for class files. :type classpath: str :param stdin, stdout, stderr: Specify the executed programs' standard input, standard output and standard error file handles, respectively. Valid values are ``subprocess.PIPE``, an existing file descriptor (a positive integer), an existing file object, and None. ``subprocess.PIPE`` indicates that a new pipe to the child should be created. With None, no redirection will occur; the child's file handles will be inherited from the parent. Additionally, stderr can be ``subprocess.STDOUT``, which indicates that the stderr data from the applications should be captured into the same file handle as for stdout. :param blocking: If ``false``, then return immediately after spawning the subprocess. In this case, the return value is the ``Popen`` object, and not a ``(stdout, stderr)`` tuple. :return: If ``blocking=True``, then return a tuple ``(stdout, stderr)``, containing the stdout and stderr outputs generated by the java command if the ``stdout`` and ``stderr`` parameters were set to ``subprocess.PIPE``; or None otherwise. If ``blocking=False``, then return a ``subprocess.Popen`` object. :raise OSError: If the java command returns a nonzero return code. """ if stdin == 'pipe': stdin = subprocess.PIPE if stdout == 'pipe': stdout = subprocess.PIPE if stderr == 'pipe': stderr = subprocess.PIPE if isinstance(cmd, compat.string_types): raise TypeError('cmd should be a list of strings') # Make sure we know where a java binary is. if _java_bin is None: config_java() # Set up the classpath. if isinstance(classpath, compat.string_types): classpaths=[classpath] else: classpaths=list(classpath) classpath=os.path.pathsep.join(classpaths) # Construct the full command string. cmd = list(cmd) cmd = ['-cp', classpath] + cmd cmd = [_java_bin] + _java_options + cmd # Call java via a subprocess p = subprocess.Popen(cmd, stdin=stdin, stdout=stdout, stderr=stderr) if not blocking: return p (stdout, stderr) = p.communicate() # Check the return code. if p.returncode != 0: print(_decode_stdoutdata(stderr)) raise OSError('Java command failed : ' + str(cmd)) return (stdout, stderr) if 0: #config_java(options='-Xmx512m') # Write: #java('weka.classifiers.bayes.NaiveBayes', # ['-d', '/tmp/names.model', '-t', '/tmp/train.arff'], # classpath='/Users/edloper/Desktop/weka/weka.jar') # Read: (a,b) = java(['weka.classifiers.bayes.NaiveBayes', '-l', '/tmp/names.model', '-T', '/tmp/test.arff', '-p', '0'],#, '-distribution'], classpath='/Users/edloper/Desktop/weka/weka.jar') ###################################################################### # Parsing ###################################################################### class ReadError(ValueError): """ Exception raised by read_* functions when they fail. :param position: The index in the input string where an error occurred. :param expected: What was expected when an error occurred. 
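    For example, a ``ReadError`` constructed with illustrative values
    renders as:

        >>> err = ReadError('an integer', 12)
        >>> print(err)
        Expected an integer at 12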
""" def __init__(self, expected, position): ValueError.__init__(self, expected, position) self.expected = expected self.position = position def __str__(self): return 'Expected %s at %s' % (self.expected, self.position) _STRING_START_RE = re.compile(r"[uU]?[rR]?(\"\"\"|\'\'\'|\"|\')") def read_str(s, start_position): """ If a Python string literal begins at the specified position in the given string, then return a tuple ``(val, end_position)`` containing the value of the string literal and the position where it ends. Otherwise, raise a ``ReadError``. """ # Read the open quote, and any modifiers. m = _STRING_START_RE.match(s, start_position) if not m: raise ReadError('open quote', start_position) quotemark = m.group(1) # Find the close quote. _STRING_END_RE = re.compile(r'\\|%s' % quotemark) position = m.end() while True: match = _STRING_END_RE.search(s, position) if not match: raise ReadError('close quote', position) if match.group(0) == '\\': position = match.end()+1 else: break # Process it, using eval. Strings with invalid escape sequences # might raise ValueEerror. try: return eval(s[start_position:match.end()]), match.end() except ValueError as e: raise ReadError('invalid string (%s)' % e) _READ_INT_RE = re.compile(r'-?\d+') def read_int(s, start_position): """ If an integer begins at the specified position in the given string, then return a tuple ``(val, end_position)`` containing the value of the integer and the position where it ends. Otherwise, raise a ``ReadError``. """ m = _READ_INT_RE.match(s, start_position) if not m: raise ReadError('integer', start_position) return int(m.group()), m.end() _READ_NUMBER_VALUE = re.compile(r'-?(\d*)([.]?\d*)?') def read_number(s, start_position): """ If an integer or float begins at the specified position in the given string, then return a tuple ``(val, end_position)`` containing the value of the number and the position where it ends. Otherwise, raise a ``ReadError``. """ m = _READ_NUMBER_VALUE.match(s, start_position) if not m or not (m.group(1) or m.group(2)): raise ReadError('number', start_position) if m.group(2): return float(m.group()), m.end() else: return int(m.group()), m.end() ###################################################################### # Check if a method has been overridden ###################################################################### def overridden(method): """ :return: True if ``method`` overrides some method with the same name in a base class. This is typically used when defining abstract base classes or interfaces, to allow subclasses to define either of two related methods: >>> class EaterI: ... '''Subclass must define eat() or batch_eat().''' ... def eat(self, food): ... if overridden(self.batch_eat): ... return self.batch_eat([food])[0] ... else: ... raise NotImplementedError() ... def batch_eat(self, foods): ... return [self.eat(food) for food in foods] :type method: instance method """ # [xx] breaks on classic classes! if isinstance(method, types.MethodType) and compat.get_im_class(method) is not None: name = method.__name__ funcs = [cls.__dict__[name] for cls in _mro(compat.get_im_class(method)) if name in cls.__dict__] return len(funcs) > 1 else: raise TypeError('Expected an instance method.') def _mro(cls): """ Return the method resolution order for ``cls`` -- i.e., a list containing ``cls`` and all its base classes, in the order in which they would be checked by ``getattr``. For new-style classes, this is just cls.__mro__. 
For classic classes, this can be obtained by a depth-first left-to-right traversal of ``__bases__``. """ if isinstance(cls, type): return cls.__mro__ else: mro = [cls] for base in cls.__bases__: mro.extend(_mro(base)) return mro ###################################################################### # Deprecation decorator & base class ###################################################################### # [xx] dedent msg first if it comes from a docstring. def _add_epytext_field(obj, field, message): """Add an epytext @field to a given object's docstring.""" indent = '' # If we already have a docstring, then add a blank line to separate # it from the new field, and check its indentation. if obj.__doc__: obj.__doc__ = obj.__doc__.rstrip()+'\n\n' indents = re.findall(r'(?<=\n)[ ]+(?!\s)', obj.__doc__.expandtabs()) if indents: indent = min(indents) # If we don't have a docstring, add an empty one. else: obj.__doc__ = '' obj.__doc__ += textwrap.fill('@%s: %s' % (field, message), initial_indent=indent, subsequent_indent=indent+' ') def deprecated(message): """ A decorator used to mark functions as deprecated. This will cause a warning to be printed the when the function is used. Usage: >>> from nltk.internals import deprecated >>> @deprecated('Use foo() instead') ... def bar(x): ... print(x/10) """ def decorator(func): msg = ("Function %s() has been deprecated. %s" % (func.__name__, message)) msg = '\n' + textwrap.fill(msg, initial_indent=' ', subsequent_indent=' ') def newFunc(*args, **kwargs): warnings.warn(msg, category=DeprecationWarning, stacklevel=2) return func(*args, **kwargs) # Copy the old function's name, docstring, & dict newFunc.__dict__.update(func.__dict__) newFunc.__name__ = func.__name__ newFunc.__doc__ = func.__doc__ newFunc.__deprecated__ = True # Add a @deprecated field to the docstring. _add_epytext_field(newFunc, 'deprecated', message) return newFunc return decorator class Deprecated(object): """ A base class used to mark deprecated classes. A typical usage is to alert users that the name of a class has changed: >>> from nltk.internals import Deprecated >>> class NewClassName(object): ... pass # All logic goes here. ... >>> class OldClassName(Deprecated, NewClassName): ... "Use NewClassName instead." The docstring of the deprecated class will be used in the deprecation warning message. """ def __new__(cls, *args, **kwargs): # Figure out which class is the deprecated one. dep_cls = None for base in _mro(cls): if Deprecated in base.__bases__: dep_cls = base; break assert dep_cls, 'Unable to determine which base is deprecated.' # Construct an appropriate warning. doc = dep_cls.__doc__ or ''.strip() # If there's a @deprecated field, strip off the field marker. doc = re.sub(r'\A\s*@deprecated:', r'', doc) # Strip off any indentation. doc = re.sub(r'(?m)^\s*', '', doc) # Construct a 'name' string. name = 'Class %s' % dep_cls.__name__ if cls != dep_cls: name += ' (base class for %s)' % cls.__name__ # Put it all together. msg = '%s has been deprecated. %s' % (name, doc) # Wrap it. msg = '\n' + textwrap.fill(msg, initial_indent=' ', subsequent_indent=' ') warnings.warn(msg, category=DeprecationWarning, stacklevel=2) # Do the actual work of __new__. return object.__new__(cls) ########################################################################## # COUNTER, FOR UNIQUE NAMING ########################################################################## class Counter: """ A counter that auto-increments each time its value is read. 
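    For example (illustrative values; the counter starts at 0 unless an
    initial value is given):

        >>> Counter().get()
        1
        >>> c = Counter(10)
        >>> c.get()
        11
        >>> c.get()
        12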
""" def __init__(self, initial_value=0): self._value = initial_value def get(self): self._value += 1 return self._value ########################################################################## # Search for files/binaries ########################################################################## def find_file_iter(filename, env_vars=(), searchpath=(), file_names=None, url=None, verbose=True, finding_dir=False): """ Search for a file to be used by nltk. :param filename: The name or path of the file. :param env_vars: A list of environment variable names to check. :param file_names: A list of alternative file names to check. :param searchpath: List of directories to search. :param url: URL presented to user for download help. :param verbose: Whether or not to print path when a file is found. """ file_names = [filename] + (file_names or []) assert isinstance(filename, compat.string_types) assert not isinstance(file_names, compat.string_types) assert not isinstance(searchpath, compat.string_types) if isinstance(env_vars, compat.string_types): env_vars = env_vars.split() yielded = False # File exists, no magic for alternative in file_names: path_to_file = os.path.join(filename, alternative) if os.path.isfile(path_to_file): if verbose: print('[Found %s: %s]' % (filename, path_to_file)) yielded = True yield path_to_file # Check the bare alternatives if os.path.isfile(alternative): if verbose: print('[Found %s: %s]' % (filename, alternative)) yielded = True yield alternative # Check if the alternative is inside a 'file' directory path_to_file = os.path.join(filename, 'file', alternative) if os.path.isfile(path_to_file): if verbose: print('[Found %s: %s]' % (filename, path_to_file)) yielded = True yield path_to_file # Check environment variables for env_var in env_vars: if env_var in os.environ: if finding_dir: # This is to file a directory instead of file yielded = True yield os.environ[env_var] for env_dir in os.environ[env_var].split(os.pathsep): # Check if the environment variable contains a direct path to the bin if os.path.isfile(env_dir): if verbose: print('[Found %s: %s]'%(filename, env_dir)) yielded = True yield env_dir # Check if the possible bin names exist inside the environment variable directories for alternative in file_names: path_to_file = os.path.join(env_dir, alternative) if os.path.isfile(path_to_file): if verbose: print('[Found %s: %s]'%(filename, path_to_file)) yielded = True yield path_to_file # Check if the alternative is inside a 'file' directory # path_to_file = os.path.join(env_dir, 'file', alternative) # Check if the alternative is inside a 'bin' directory path_to_file = os.path.join(env_dir, 'bin', alternative) if os.path.isfile(path_to_file): if verbose: print('[Found %s: %s]' % (filename, path_to_file)) yielded = True yield path_to_file # Check the path list. for directory in searchpath: for alternative in file_names: path_to_file = os.path.join(directory, alternative) if os.path.isfile(path_to_file): yielded = True yield path_to_file # If we're on a POSIX system, then try using the 'which' command # to find the file. 
if os.name == 'posix': for alternative in file_names: try: p = subprocess.Popen(['which', alternative], stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = p.communicate() path = _decode_stdoutdata(stdout).strip() if path.endswith(alternative) and os.path.exists(path): if verbose: print('[Found %s: %s]' % (filename, path)) yielded = True yield path except (KeyboardInterrupt, SystemExit, OSError): raise except: pass if not yielded: msg = ("NLTK was unable to find the %s file!" "\nUse software specific " "configuration paramaters" % filename) if env_vars: msg += ' or set the %s environment variable' % env_vars[0] msg += '.' if searchpath: msg += '\n\n Searched in:' msg += ''.join('\n - %s' % d for d in searchpath) if url: msg += ('\n\n For more information on %s, see:\n <%s>' % (filename, url)) div = '='*75 raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div)) def find_file(filename, env_vars=(), searchpath=(), file_names=None, url=None, verbose=True): return next(find_file_iter(filename, env_vars, searchpath, file_names, url, verbose)) def find_dir(filename, env_vars=(), searchpath=(), file_names=None, url=None, verbose=True): return next(find_file_iter(filename, env_vars, searchpath, file_names, url, verbose, finding_dir=True)) def find_binary_iter(name, path_to_bin=None, env_vars=(), searchpath=(), binary_names=None, url=None, verbose=True): """ Search for a file to be used by nltk. :param name: The name or path of the file. :param path_to_bin: The user-supplied binary location (deprecated) :param env_vars: A list of environment variable names to check. :param file_names: A list of alternative file names to check. :param searchpath: List of directories to search. :param url: URL presented to user for download help. :param verbose: Whether or not to print path when a file is found. """ for file in find_file_iter(path_to_bin or name, env_vars, searchpath, binary_names, url, verbose): yield file def find_binary(name, path_to_bin=None, env_vars=(), searchpath=(), binary_names=None, url=None, verbose=True): return next(find_binary_iter(name, path_to_bin, env_vars, searchpath, binary_names, url, verbose)) def find_jar_iter(name_pattern, path_to_jar=None, env_vars=(), searchpath=(), url=None, verbose=True, is_regex=False): """ Search for a jar that is used by nltk. :param name_pattern: The name of the jar file :param path_to_jar: The user-supplied jar location, or None. :param env_vars: A list of environment variable names to check in addition to the CLASSPATH variable which is checked by default. :param searchpath: List of directories to search. :param is_regex: Whether name is a regular expression. """ assert isinstance(name_pattern, compat.string_types) assert not isinstance(searchpath, compat.string_types) if isinstance(env_vars, compat.string_types): env_vars = env_vars.split() yielded = False # Make sure we check the CLASSPATH first env_vars = ['CLASSPATH'] + list(env_vars) # If an explicit location was given, then check it, and yield it if # it's present; otherwise, complain. 
if path_to_jar is not None: if os.path.isfile(path_to_jar): yielded = True yield path_to_jar else: raise LookupError('Could not find %s jar file at %s' % (name_pattern, path_to_jar)) # Check environment variables for env_var in env_vars: if env_var in os.environ: if env_var == 'CLASSPATH': classpath = os.environ['CLASSPATH'] for cp in classpath.split(os.path.pathsep): if os.path.isfile(cp): filename=os.path.basename(cp) if is_regex and re.match(name_pattern, filename) or \ (not is_regex and filename == name_pattern): if verbose: print('[Found %s: %s]' % (name_pattern, cp)) yielded = True yield cp # The case where user put directory containing the jar file in the classpath if os.path.isdir(cp): if not is_regex: if os.path.isfile(os.path.join(cp,name_pattern)): if verbose: print('[Found %s: %s]' % (name_pattern, cp)) yielded = True yield os.path.join(cp,name_pattern) else: # Look for file using regular expression for file_name in os.listdir(cp): if re.match(name_pattern,file_name): if verbose: print('[Found %s: %s]' % (name_pattern, os.path.join(cp,file_name))) yielded = True yield os.path.join(cp,file_name) else: jar_env = os.environ[env_var] jar_iter = ((os.path.join(jar_env, path_to_jar) for path_to_jar in os.listdir(jar_env)) if os.path.isdir(jar_env) else (jar_env,)) for path_to_jar in jar_iter: if os.path.isfile(path_to_jar): filename=os.path.basename(path_to_jar) if is_regex and re.match(name_pattern, filename) or \ (not is_regex and filename == name_pattern): if verbose: print('[Found %s: %s]' % (name_pattern, path_to_jar)) yielded = True yield path_to_jar # Check the path list. for directory in searchpath: if is_regex: for filename in os.listdir(directory): path_to_jar = os.path.join(directory, filename) if os.path.isfile(path_to_jar): if re.match(name_pattern, filename): if verbose: print('[Found %s: %s]' % (filename, path_to_jar)) yielded = True yield path_to_jar else: path_to_jar = os.path.join(directory, name_pattern) if os.path.isfile(path_to_jar): if verbose: print('[Found %s: %s]' % (name_pattern, path_to_jar)) yielded = True yield path_to_jar if not yielded: # If nothing was found, raise an error msg = ("NLTK was unable to find %s!" 
% name_pattern) if env_vars: msg += ' Set the %s environment variable' % env_vars[0] msg = textwrap.fill(msg+'.', initial_indent=' ', subsequent_indent=' ') if searchpath: msg += '\n\n Searched in:' msg += ''.join('\n - %s' % d for d in searchpath) if url: msg += ('\n\n For more information, on %s, see:\n <%s>' % (name_pattern, url)) div = '='*75 raise LookupError('\n\n%s\n%s\n%s' % (div, msg, div)) def find_jar(name_pattern, path_to_jar=None, env_vars=(), searchpath=(), url=None, verbose=True, is_regex=False): return next(find_jar_iter(name_pattern, path_to_jar, env_vars, searchpath, url, verbose, is_regex)) def find_jars_within_path(path_to_jars): return [os.path.join(root, filename) for root, dirnames, filenames in os.walk(path_to_jars) for filename in fnmatch.filter(filenames, '*.jar')] def _decode_stdoutdata(stdoutdata): """ Convert data read from stdout/stderr to unicode """ if not isinstance(stdoutdata, bytes): return stdoutdata encoding = getattr(sys.__stdout__, "encoding", locale.getpreferredencoding()) if encoding is None: return stdoutdata.decode() return stdoutdata.decode(encoding) ########################################################################## # Import Stdlib Module ########################################################################## def import_from_stdlib(module): """ When python is run from within the nltk/ directory tree, the current directory is included at the beginning of the search path. Unfortunately, that means that modules within nltk can sometimes shadow standard library modules. As an example, the stdlib 'inspect' module will attempt to import the stdlib 'tokenize' module, but will instead end up importing NLTK's 'tokenize' module instead (causing the import to fail). """ old_path = sys.path sys.path = [d for d in sys.path if d not in ('', '.')] m = __import__(module) sys.path = old_path return m ########################################################################## # Wrapper for ElementTree Elements ########################################################################## @compat.python_2_unicode_compatible class ElementWrapper(object): """ A wrapper around ElementTree Element objects whose main purpose is to provide nicer __repr__ and __str__ methods. In addition, any of the wrapped Element's methods that return other Element objects are overridden to wrap those values before returning them. This makes Elements more convenient to work with in interactive sessions and doctests, at the expense of some efficiency. """ # Prevent double-wrapping: def __new__(cls, etree): """ Create and return a wrapper around a given Element object. If ``etree`` is an ``ElementWrapper``, then ``etree`` is returned as-is. """ if isinstance(etree, ElementWrapper): return etree else: return object.__new__(ElementWrapper) def __init__(self, etree): r""" Initialize a new Element wrapper for ``etree``. If ``etree`` is a string, then it will be converted to an Element object using ``ElementTree.fromstring()`` first: >>> ElementWrapper("") \n"> """ if isinstance(etree, compat.string_types): etree = ElementTree.fromstring(etree) self.__dict__['_etree'] = etree def unwrap(self): """ Return the Element object wrapped by this wrapper. 
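        A small illustrative check that the original Element object is
        returned unchanged:

            >>> elt = ElementTree.fromstring('<cat />')
            >>> ElementWrapper(elt).unwrap() is elt
            True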
""" return self._etree ##//////////////////////////////////////////////////////////// #{ String Representation ##//////////////////////////////////////////////////////////// def __repr__(self): s = ElementTree.tostring(self._etree, encoding='utf8').decode('utf8') if len(s) > 60: e = s.rfind('<') if (len(s)-e) > 30: e = -20 s = '%s...%s' % (s[:30], s[e:]) return '' % s def __str__(self): """ :return: the result of applying ``ElementTree.tostring()`` to the wrapped Element object. """ return ElementTree.tostring(self._etree, encoding='utf8').decode('utf8').rstrip() ##//////////////////////////////////////////////////////////// #{ Element interface Delegation (pass-through) ##//////////////////////////////////////////////////////////// def __getattr__(self, attrib): return getattr(self._etree, attrib) def __setattr__(self, attr, value): return setattr(self._etree, attr, value) def __delattr__(self, attr): return delattr(self._etree, attr) def __setitem__(self, index, element): self._etree[index] = element def __delitem__(self, index): del self._etree[index] def __setslice__(self, start, stop, elements): self._etree[start:stop] = elements def __delslice__(self, start, stop): del self._etree[start:stop] def __len__(self): return len(self._etree) ##//////////////////////////////////////////////////////////// #{ Element interface Delegation (wrap result) ##//////////////////////////////////////////////////////////// def __getitem__(self, index): return ElementWrapper(self._etree[index]) def __getslice__(self, start, stop): return [ElementWrapper(elt) for elt in self._etree[start:stop]] def getchildren(self): return [ElementWrapper(elt) for elt in self._etree] def getiterator(self, tag=None): return (ElementWrapper(elt) for elt in self._etree.getiterator(tag)) def makeelement(self, tag, attrib): return ElementWrapper(self._etree.makeelement(tag, attrib)) def find(self, path): elt = self._etree.find(path) if elt is None: return elt else: return ElementWrapper(elt) def findall(self, path): return [ElementWrapper(elt) for elt in self._etree.findall(path)] ###################################################################### # Helper for Handling Slicing ###################################################################### def slice_bounds(sequence, slice_obj, allow_step=False): """ Given a slice, return the corresponding (start, stop) bounds, taking into account None indices and negative indices. The following guarantees are made for the returned start and stop values: - 0 <= start <= len(sequence) - 0 <= stop <= len(sequence) - start <= stop :raise ValueError: If ``slice_obj.step`` is not None. :param allow_step: If true, then the slice object may have a non-None step. If it does, then return a tuple (start, stop, step). """ start, stop = (slice_obj.start, slice_obj.stop) # If allow_step is true, then include the step in our return # value tuple. if allow_step: step = slice_obj.step if step is None: step = 1 # Use a recursive call without allow_step to find the slice # bounds. If step is negative, then the roles of start and # stop (in terms of default values, etc), are swapped. if step < 0: start, stop = slice_bounds(sequence, slice(stop, start)) else: start, stop = slice_bounds(sequence, slice(start, stop)) return start, stop, step # Otherwise, make sure that no non-default step value is used. elif slice_obj.step not in (None, 1): raise ValueError('slices with steps are not supported by %s' % sequence.__class__.__name__) # Supply default offsets. 
if start is None: start = 0 if stop is None: stop = len(sequence) # Handle negative indices. if start < 0: start = max(0, len(sequence)+start) if stop < 0: stop = max(0, len(sequence)+stop) # Make sure stop doesn't go past the end of the list. Note that # we avoid calculating len(sequence) if possible, because for lazy # sequences, calculating the length of a sequence can be expensive. if stop > 0: try: sequence[stop-1] except IndexError: stop = len(sequence) # Make sure start isn't past stop. start = min(start, stop) # That's all folks! return start, stop ###################################################################### # Permission Checking ###################################################################### def is_writable(path): # Ensure that it exists. if not os.path.exists(path): return False # If we're on a posix system, check its permissions. if hasattr(os, 'getuid'): statdata = os.stat(path) perm = stat.S_IMODE(statdata.st_mode) # is it world-writable? if (perm & 0o002): return True # do we own it? elif statdata.st_uid == os.getuid() and (perm & 0o200): return True # are we in a group that can write to it? elif (statdata.st_gid in [os.getgid()] + os.getgroups()) \ and (perm & 0o020): return True # otherwise, we can't write to it. else: return False # Otherwise, we'll assume it's writable. # [xx] should we do other checks on other platforms? return True ###################################################################### # NLTK Error reporting ###################################################################### def raise_unorderable_types(ordering, a, b): raise TypeError("unorderable types: %s() %s %s()" % (type(a).__name__, ordering, type(b).__name__)) nltk-3.1/nltk/jsontags.py0000644000076500000240000000364712607224144015242 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: JSON Encoder/Decoder Helpers # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Xu # # URL: # For license information, see LICENSE.TXT """ Register JSON tags, so the nltk data loader knows what module and class to look for. NLTK uses simple '!' tags to mark the types of objects, but the fully-qualified "tag:nltk.org,2011:" prefix is also accepted in case anyone ends up using it. """ import json json_tags = {} TAG_PREFIX = '!' def register_tag(cls): """ Decorates a class to register it's json tag. """ json_tags[TAG_PREFIX+getattr(cls, 'json_tag')] = cls return cls class JSONTaggedEncoder(json.JSONEncoder): def default(self, obj): obj_tag = getattr(obj, 'json_tag', None) if obj_tag is None: return super(JSONTaggedEncoder, self).default(obj) obj_tag = TAG_PREFIX + obj_tag obj = obj.encode_json_obj() return {obj_tag: obj} class JSONTaggedDecoder(json.JSONDecoder): def decode(self, s): return self.decode_obj(super(JSONTaggedDecoder, self).decode(s)) @classmethod def decode_obj(cls, obj): # Decode nested objects first. if isinstance(obj, dict): obj = dict((key, cls.decode_obj(val)) for (key, val) in obj.items()) elif isinstance(obj, list): obj = list(cls.decode_obj(val) for val in obj) # Check if we have a tagged object. 
if not isinstance(obj, dict) or len(obj) != 1: return obj obj_tag = next(iter(obj.keys())) if not obj_tag.startswith('!'): return obj if obj_tag not in json_tags: raise ValueError('Unknown tag', obj_tag) obj_cls = json_tags[obj_tag] return obj_cls.decode_json_obj(obj[obj_tag]) __all__ = ['register_tag', 'json_tags', 'JSONTaggedEncoder', 'JSONTaggedDecoder'] nltk-3.1/nltk/lazyimport.py0000644000076500000240000001102712574600335015616 0ustar sbstaff00000000000000# This module is from mx/DateTime/LazyModule.py and is # distributed under the terms of the eGenix.com Public License Agreement # http://www.egenix.com/products/eGenix.com-Public-License-1.1.0.pdf """ Helper to enable simple lazy module import. 'Lazy' means the actual import is deferred until an attribute is requested from the module's namespace. This has the advantage of allowing all imports to be done at the top of a script (in a prominent and visible place) without having a great impact on startup time. Copyright (c) 1999-2005, Marc-Andre Lemburg; mailto:mal@lemburg.com See the documentation for further information on copyrights, or contact the author. All Rights Reserved. """ from __future__ import print_function ### Constants _debug = 0 ### class LazyModule: """ Lazy module class. Lazy modules are imported into the given namespaces whenever a non-special attribute (there are some attributes like __doc__ that class instances handle without calling __getattr__) is requested. The module is then registered under the given name in locals usually replacing the import wrapper instance. The import itself is done using globals as global namespace. Example of creating a lazy load module: ISO = LazyModule('ISO',locals(),globals()) Later, requesting an attribute from ISO will load the module automatically into the locals() namespace, overriding the LazyModule instance: t = ISO.Week(1998,1,1) """ # Flag which inidicates whether the LazyModule is initialized or not __lazymodule_init = 0 # Name of the module to load __lazymodule_name = '' # Flag which indicates whether the module was loaded or not __lazymodule_loaded = 0 # Locals dictionary where to register the module __lazymodule_locals = None # Globals dictionary to use for the module import __lazymodule_globals = None def __init__(self, name, locals, globals=None): """ Create a LazyModule instance wrapping module name. The module will later on be registered in locals under the given module name. globals is optional and defaults to locals. """ self.__lazymodule_locals = locals if globals is None: globals = locals self.__lazymodule_globals = globals mainname = globals.get('__name__', '') if mainname: self.__name__ = mainname + '.' + name self.__lazymodule_name = name else: self.__name__ = self.__lazymodule_name = name self.__lazymodule_init = 1 def __lazymodule_import(self): """ Import the module now. """ # Load and register module name = self.__lazymodule_name if self.__lazymodule_loaded: return self.__lazymodule_locals[name] if _debug: print('LazyModule: Loading module %r' % name) self.__lazymodule_locals[name] \ = module \ = __import__(name, self.__lazymodule_locals, self.__lazymodule_globals, '*') # Fill namespace with all symbols from original module to # provide faster access. self.__dict__.update(module.__dict__) # Set import flag self.__dict__['__lazymodule_loaded'] = 1 if _debug: print('LazyModule: Module %r loaded' % name) return module def __getattr__(self, name): """ Import the module on demand and get the attribute. 
""" if self.__lazymodule_loaded: raise AttributeError(name) if _debug: print('LazyModule: ' \ 'Module load triggered by attribute %r read access' % name) module = self.__lazymodule_import() return getattr(module, name) def __setattr__(self, name, value): """ Import the module on demand and set the attribute. """ if not self.__lazymodule_init: self.__dict__[name] = value return if self.__lazymodule_loaded: self.__lazymodule_locals[self.__lazymodule_name] = value self.__dict__[name] = value return if _debug: print('LazyModule: ' \ 'Module load triggered by attribute %r write access' % name) module = self.__lazymodule_import() setattr(module, name, value) def __repr__(self): return "" % self.__name__ nltk-3.1/nltk/metrics/0000755000076500000240000000000012610001541014460 5ustar sbstaff00000000000000nltk-3.1/nltk/metrics/__init__.py0000644000076500000240000000242412607224144016607 0ustar sbstaff00000000000000# Natural Language Toolkit: Metrics # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT # """ NLTK Metrics Classes and methods for scoring processing modules. """ from nltk.metrics.scores import (accuracy, precision, recall, f_measure, log_likelihood, approxrand) from nltk.metrics.confusionmatrix import ConfusionMatrix from nltk.metrics.distance import (edit_distance, binary_distance, jaccard_distance, masi_distance, interval_distance, custom_distance, presence, fractional_presence) from nltk.metrics.paice import Paice from nltk.metrics.segmentation import windowdiff, ghd, pk from nltk.metrics.agreement import AnnotationTask from nltk.metrics.association import (NgramAssocMeasures, BigramAssocMeasures, TrigramAssocMeasures, ContingencyMeasures) from nltk.metrics.spearman import (spearman_correlation, ranks_from_sequence, ranks_from_scores) nltk-3.1/nltk/metrics/agreement.py0000644000076500000240000003522412607224144017023 0ustar sbstaff00000000000000# Natural Language Toolkit: Agreement Metrics # # Copyright (C) 2001-2015 NLTK Project # Author: Tom Lippincott # URL: # For license information, see LICENSE.TXT # """ Implementations of inter-annotator agreement coefficients surveyed by Artstein and Poesio (2007), Inter-Coder Agreement for Computational Linguistics. An agreement coefficient calculates the amount that annotators agreed on label assignments beyond what is expected by chance. In defining the AnnotationTask class, we use naming conventions similar to the paper's terminology. There are three types of objects in an annotation task: the coders (variables "c" and "C") the items to be annotated (variables "i" and "I") the potential categories to be assigned (variables "k" and "K") Additionally, it is often the case that we don't want to treat two different labels as complete disagreement, and so the AnnotationTask constructor can also take a distance metric as a final argument. Distance metrics are simply functions that take two arguments, and return a value between 0.0 and 1.0 indicating the distance between them. If not supplied, the default is binary comparison between the arguments. The simplest way to initialize an AnnotationTask is with a list of triples, each containing a coder's assignment for one object in the task: task = AnnotationTask(data=[('c1', '1', 'v1'),('c2', '1', 'v1'),...]) Note that the data list needs to contain the same number of triples for each individual coder, containing category values for the same set of items. 
Alpha (Krippendorff 1980) Kappa (Cohen 1960) S (Bennet, Albert and Goldstein 1954) Pi (Scott 1955) TODO: Describe handling of multiple coders and missing data Expected results from the Artstein and Poesio survey paper: >>> from nltk.metrics.agreement import AnnotationTask >>> import os.path >>> t = AnnotationTask(data=[x.split() for x in open(os.path.join(os.path.dirname(__file__), "artstein_poesio_example.txt"))]) >>> t.avg_Ao() 0.88 >>> t.pi() 0.7995322418977615... >>> t.S() 0.8199999999999998... This would have returned a wrong value (0.0) in @785fb79 as coders are in the wrong order. Subsequently, all values for pi(), S(), and kappa() would have been wrong as they are computed with avg_Ao(). >>> t2 = AnnotationTask(data=[('b','1','stat'),('a','1','stat')]) >>> t2.avg_Ao() 1.0 The following, of course, also works. >>> t3 = AnnotationTask(data=[('a','1','othr'),('b','1','othr')]) >>> t3.avg_Ao() 1.0 """ from __future__ import print_function, unicode_literals import logging from itertools import groupby from operator import itemgetter from nltk.probability import FreqDist, ConditionalFreqDist from nltk.internals import deprecated from nltk.compat import python_2_unicode_compatible, iteritems from nltk.metrics.distance import binary_distance log = logging.getLogger(__file__) @python_2_unicode_compatible class AnnotationTask(object): """Represents an annotation task, i.e. people assign labels to items. Notation tries to match notation in Artstein and Poesio (2007). In general, coders and items can be represented as any hashable object. Integers, for example, are fine, though strings are more readable. Labels must support the distance functions applied to them, so e.g. a string-edit-distance makes no sense if your labels are integers, whereas interval distance needs numeric values. A notable case of this is the MASI metric, which requires Python sets. """ def __init__(self, data=None, distance=binary_distance): """Initialize an empty annotation task. """ self.distance = distance self.I = set() self.K = set() self.C = set() self.data = [] if data is not None: self.load_array(data) def __str__(self): return "\r\n".join(map(lambda x:"%s\t%s\t%s" % (x['coder'], x['item'].replace('_', "\t"), ",".join(x['labels'])), self.data)) def load_array(self, array): """Load the results of annotation. The argument is a list of 3-tuples, each representing a coder's labeling of an item: (coder,item,label) """ for coder, item, labels in array: self.C.add(coder) self.K.add(labels) self.I.add(item) self.data.append({'coder':coder, 'labels':labels, 'item':item}) def agr(self, cA, cB, i, data=None): """Agreement between two coders on a given item """ data = data or self.data # cfedermann: we don't know what combination of coder/item will come # first in x; to avoid StopIteration problems due to assuming an order # cA,cB, we allow either for k1 and then look up the missing as k2. 
k1 = next((x for x in data if x['coder'] in (cA,cB) and x['item']==i)) if k1['coder'] == cA: k2 = next((x for x in data if x['coder']==cB and x['item']==i)) else: k2 = next((x for x in data if x['coder']==cA and x['item']==i)) ret = 1.0 - float(self.distance(k1['labels'], k2['labels'])) log.debug("Observed agreement between %s and %s on %s: %f", cA, cB, i, ret) log.debug("Distance between \"%r\" and \"%r\": %f", k1['labels'], k2['labels'], 1.0 - ret) return ret def Nk(self, k): return float(sum(1 for x in self.data if x['labels'] == k)) def Nik(self, i, k): return float(sum(1 for x in self.data if x['item'] == i and x['labels'] == k)) def Nck(self, c, k): return float(sum(1 for x in self.data if x['coder'] == c and x['labels'] == k)) @deprecated('Use Nk, Nik or Nck instead') def N(self, k=None, i=None, c=None): """Implements the "n-notation" used in Artstein and Poesio (2007) """ if k is not None and i is None and c is None: ret = self.Nk(k) elif k is not None and i is not None and c is None: ret = self.Nik(i, k) elif k is not None and c is not None and i is None: ret = self.Nck(c, k) else: raise ValueError("You must pass either i or c, not both! (k=%r,i=%r,c=%r)" % (k, i, c)) log.debug("Count on N[%s,%s,%s]: %d", k, i, c, ret) return ret def _grouped_data(self, field, data=None): data = data or self.data return groupby(sorted(data, key=itemgetter(field)), itemgetter(field)) def Ao(self, cA, cB): """Observed agreement between two coders on all items. """ data = self._grouped_data('item', (x for x in self.data if x['coder'] in (cA, cB))) ret = float(sum(self.agr(cA, cB, item, item_data) for item, item_data in data)) / float(len(self.I)) log.debug("Observed agreement between %s and %s: %f", cA, cB, ret) return ret def _pairwise_average(self, function): """ Calculates the average of function results for each coder pair """ total = 0 n = 0 s = self.C.copy() for cA in self.C: s.remove(cA) for cB in s: total += function(cA, cB) n += 1 ret = total / n return ret def avg_Ao(self): """Average observed agreement across all coders and items. """ ret = self._pairwise_average(self.Ao) log.debug("Average observed agreement: %f", ret) return ret def Do_alpha(self): """The observed disagreement for the alpha coefficient. The alpha coefficient, unlike the other metrics, uses this rather than observed agreement. """ total = 0.0 for i, itemdata in self._grouped_data('item'): label_freqs = FreqDist(x['labels'] for x in itemdata) for j, nj in iteritems(label_freqs): for l, nl in iteritems(label_freqs): total += float(nj * nl) * self.distance(l, j) ret = (1.0 / float((len(self.I) * len(self.C) * (len(self.C) - 1)))) * total log.debug("Observed disagreement: %f", ret) return ret def Do_Kw_pairwise(self,cA,cB,max_distance=1.0): """The observed disagreement for the weighted kappa coefficient. 
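        That is, the distance between the two coders' labels, summed over
        all items and normalized by ``len(self.I) * max_distance``.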
""" total = 0.0 data = (x for x in self.data if x['coder'] in (cA, cB)) for i, itemdata in self._grouped_data('item', data): # we should have two items; distance doesn't care which comes first total += self.distance(next(itemdata)['labels'], next(itemdata)['labels']) ret = total / (len(self.I) * max_distance) log.debug("Observed disagreement between %s and %s: %f", cA, cB, ret) return ret def Do_Kw(self, max_distance=1.0): """Averaged over all labelers """ ret = self._pairwise_average(lambda cA, cB: self.Do_Kw_pairwise(cA, cB, max_distance)) log.debug("Observed disagreement: %f", ret) return ret # Agreement Coefficients def S(self): """Bennett, Albert and Goldstein 1954 """ Ae = 1.0 / float(len(self.K)) ret = (self.avg_Ao() - Ae) / (1.0 - Ae) return ret def pi(self): """Scott 1955; here, multi-pi. Equivalent to K from Siegel and Castellan (1988). """ total = 0.0 label_freqs = FreqDist(x['labels'] for x in self.data) for k, f in iteritems(label_freqs): total += f ** 2 Ae = total / float((len(self.I) * len(self.C)) ** 2) return (self.avg_Ao() - Ae) / (1 - Ae) def Ae_kappa(self, cA, cB): Ae = 0.0 nitems = float(len(self.I)) label_freqs = ConditionalFreqDist((x['labels'], x['coder']) for x in self.data) for k in label_freqs.conditions(): Ae += (label_freqs[k][cA] / nitems) * (label_freqs[k][cB] / nitems) return Ae def kappa_pairwise(self, cA, cB): """ """ Ae = self.Ae_kappa(cA, cB) ret = (self.Ao(cA, cB) - Ae) / (1.0 - Ae) log.debug("Expected agreement between %s and %s: %f", cA, cB, Ae) return ret def kappa(self): """Cohen 1960 Averages naively over kappas for each coder pair. """ return self._pairwise_average(self.kappa_pairwise) def multi_kappa(self): """Davies and Fleiss 1982 Averages over observed and expected agreements for each coder pair. """ Ae = self._pairwise_average(self.Ae_kappa) return (self.avg_Ao() - Ae) / (1.0 - Ae) def alpha(self): """Krippendorff 1980 """ De = 0.0 label_freqs = FreqDist(x['labels'] for x in self.data) for j in self.K: nj = label_freqs[j] for l in self.K: De += float(nj * label_freqs[l]) * self.distance(j, l) De = (1.0 / (len(self.I) * len(self.C) * (len(self.I) * len(self.C) - 1))) * De log.debug("Expected disagreement: %f", De) ret = 1.0 - (self.Do_alpha() / De) return ret def weighted_kappa_pairwise(self, cA, cB, max_distance=1.0): """Cohen 1968 """ total = 0.0 label_freqs = ConditionalFreqDist((x['coder'], x['labels']) for x in self.data if x['coder'] in (cA, cB)) for j in self.K: for l in self.K: total += label_freqs[cA][j] * label_freqs[cB][l] * self.distance(j, l) De = total / (max_distance * pow(len(self.I), 2)) log.debug("Expected disagreement between %s and %s: %f", cA, cB, De) Do = self.Do_Kw_pairwise(cA, cB) ret = 1.0 - (Do / De) return ret def weighted_kappa(self, max_distance=1.0): """Cohen 1968 """ return self._pairwise_average(lambda cA, cB: self.weighted_kappa_pairwise(cA, cB, max_distance)) if __name__ == '__main__': import re import optparse from nltk.metrics import distance # process command-line arguments parser = optparse.OptionParser() parser.add_option("-d", "--distance", dest="distance", default="binary_distance", help="distance metric to use") parser.add_option("-a", "--agreement", dest="agreement", default="kappa", help="agreement coefficient to calculate") parser.add_option("-e", "--exclude", dest="exclude", action="append", default=[], help="coder names to exclude (may be specified multiple times)") parser.add_option("-i", "--include", dest="include", action="append", default=[], help="coder names to include, same format as 
exclude") parser.add_option("-f", "--file", dest="file", help="file to read labelings from, each line with three columns: 'labeler item labels'") parser.add_option("-v", "--verbose", dest="verbose", default='0', help="how much debugging to print on stderr (0-4)") parser.add_option("-c", "--columnsep", dest="columnsep", default="\t", help="char/string that separates the three columns in the file, defaults to tab") parser.add_option("-l", "--labelsep", dest="labelsep", default=",", help="char/string that separates labels (if labelers can assign more than one), defaults to comma") parser.add_option("-p", "--presence", dest="presence", default=None, help="convert each labeling into 1 or 0, based on presence of LABEL") parser.add_option("-T", "--thorough", dest="thorough", default=False, action="store_true", help="calculate agreement for every subset of the annotators") (options, remainder) = parser.parse_args() if not options.file: parser.print_help() exit() logging.basicConfig(level=50 - 10 * int(options.verbose)) # read in data from the specified file data = [] with open(options.file, 'r') as infile: for l in infile: toks = l.split(options.columnsep) coder, object_, labels = toks[0], str(toks[1:-1]), frozenset(toks[-1].strip().split(options.labelsep)) if ((options.include == options.exclude) or (len(options.include) > 0 and coder in options.include) or (len(options.exclude) > 0 and coder not in options.exclude)): data.append((coder, object_, labels)) if options.presence: task = AnnotationTask(data, getattr(distance, options.distance)(options.presence)) else: task = AnnotationTask(data, getattr(distance, options.distance)) if options.thorough: pass else: print(getattr(task, options.agreement)()) logging.shutdown() nltk-3.1/nltk/metrics/association.py0000644000076500000240000003621412607224144017370 0ustar sbstaff00000000000000# Natural Language Toolkit: Ngram Association Measures # # Copyright (C) 2001-2015 NLTK Project # Author: Joel Nothman # URL: # For license information, see LICENSE.TXT """ Provides scoring functions for a number of association measures through a generic, abstract implementation in ``NgramAssocMeasures``, and n-specific ``BigramAssocMeasures`` and ``TrigramAssocMeasures``. """ import math as _math from functools import reduce _log2 = lambda x: _math.log(x, 2.0) _ln = _math.log _product = lambda s: reduce(lambda x, y: x * y, s) _SMALL = 1e-20 try: from scipy.stats import fisher_exact except ImportError: def fisher_exact(*_args, **_kwargs): raise NotImplementedError ### Indices to marginals arguments: NGRAM = 0 """Marginals index for the ngram count""" UNIGRAMS = -2 """Marginals index for a tuple of each unigram count""" TOTAL = -1 """Marginals index for the number of words in the data""" class NgramAssocMeasures(object): """ An abstract class defining a collection of generic association measures. Each public method returns a score, taking the following arguments:: score_fn(count_of_ngram, (count_of_n-1gram_1, ..., count_of_n-1gram_j), (count_of_n-2gram_1, ..., count_of_n-2gram_k), ..., (count_of_1gram_1, ..., count_of_1gram_n), count_of_total_words) See ``BigramAssocMeasures`` and ``TrigramAssocMeasures`` Inheriting classes should define a property _n, and a method _contingency which calculates contingency values from marginals in order for all association measures defined here to be usable. 
""" _n = 0 @staticmethod def _contingency(*marginals): """Calculates values of a contingency table from marginal values.""" raise NotImplementedError("The contingency table is not available" "in the general ngram case") @staticmethod def _marginals(*contingency): """Calculates values of contingency table marginals from its values.""" raise NotImplementedError("The contingency table is not available" "in the general ngram case") @classmethod def _expected_values(cls, cont): """Calculates expected values for a contingency table.""" n_all = sum(cont) bits = [1 << i for i in range(cls._n)] # For each contingency table cell for i in range(len(cont)): # Yield the expected value yield (_product(sum(cont[x] for x in range(2 ** cls._n) if (x & j) == (i & j)) for j in bits) / float(n_all ** (cls._n - 1))) @staticmethod def raw_freq(*marginals): """Scores ngrams by their frequency""" return float(marginals[NGRAM]) / marginals[TOTAL] @classmethod def student_t(cls, *marginals): """Scores ngrams using Student's t test with independence hypothesis for unigrams, as in Manning and Schutze 5.3.1. """ return ((marginals[NGRAM] - _product(marginals[UNIGRAMS]) / float(marginals[TOTAL] ** (cls._n - 1))) / (marginals[NGRAM] + _SMALL) ** .5) @classmethod def chi_sq(cls, *marginals): """Scores ngrams using Pearson's chi-square as in Manning and Schutze 5.3.3. """ cont = cls._contingency(*marginals) exps = cls._expected_values(cont) return sum((obs - exp) ** 2 / (exp + _SMALL) for obs, exp in zip(cont, exps)) @staticmethod def mi_like(*marginals, **kwargs): """Scores ngrams using a variant of mutual information. The keyword argument power sets an exponent (default 3) for the numerator. No logarithm of the result is calculated. """ return (marginals[NGRAM] ** kwargs.get('power', 3) / float(_product(marginals[UNIGRAMS]))) @classmethod def pmi(cls, *marginals): """Scores ngrams by pointwise mutual information, as in Manning and Schutze 5.4. """ return (_log2(marginals[NGRAM] * marginals[TOTAL] ** (cls._n - 1)) - _log2(_product(marginals[UNIGRAMS]))) @classmethod def likelihood_ratio(cls, *marginals): """Scores ngrams using likelihood ratios as in Manning and Schutze 5.3.4. """ cont = cls._contingency(*marginals) return (cls._n * sum(obs * _ln(float(obs) / (exp + _SMALL) + _SMALL) for obs, exp in zip(cont, cls._expected_values(cont)))) @classmethod def poisson_stirling(cls, *marginals): """Scores ngrams using the Poisson-Stirling measure.""" exp = (_product(marginals[UNIGRAMS]) / float(marginals[TOTAL] ** (cls._n - 1))) return marginals[NGRAM] * (_log2(marginals[NGRAM] / exp) - 1) @classmethod def jaccard(cls, *marginals): """Scores ngrams using the Jaccard index.""" cont = cls._contingency(*marginals) return float(cont[0]) / sum(cont[:-1]) class BigramAssocMeasures(NgramAssocMeasures): """ A collection of bigram association measures. Each association measure is provided as a function with three arguments:: bigram_score_fn(n_ii, (n_ix, n_xi), n_xx) The arguments constitute the marginals of a contingency table, counting the occurrences of particular events in a corpus. The letter i in the suffix refers to the appearance of the word in question, while x indicates the appearance of any word. Thus, for example: n_ii counts (w1, w2), i.e. the bigram being scored n_ix counts (w1, *) n_xi counts (*, w2) n_xx counts (*, *), i.e. 
any bigram This may be shown with respect to a contingency table:: w1 ~w1 ------ ------ w2 | n_ii | n_oi | = n_xi ------ ------ ~w2 | n_io | n_oo | ------ ------ = n_ix TOTAL = n_xx """ _n = 2 @staticmethod def _contingency(n_ii, n_ix_xi_tuple, n_xx): """Calculates values of a bigram contingency table from marginal values.""" (n_ix, n_xi) = n_ix_xi_tuple n_oi = n_xi - n_ii n_io = n_ix - n_ii return (n_ii, n_oi, n_io, n_xx - n_ii - n_oi - n_io) @staticmethod def _marginals(n_ii, n_oi, n_io, n_oo): """Calculates values of contingency table marginals from its values.""" return (n_ii, (n_oi + n_ii, n_io + n_ii), n_oo + n_oi + n_io + n_ii) @staticmethod def _expected_values(cont): """Calculates expected values for a contingency table.""" n_xx = sum(cont) # For each contingency table cell for i in range(4): yield (cont[i] + cont[i ^ 1]) * (cont[i] + cont[i ^ 2]) / float(n_xx) @classmethod def phi_sq(cls, *marginals): """Scores bigrams using phi-square, the square of the Pearson correlation coefficient. """ n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals) return (float((n_ii*n_oo - n_io*n_oi)**2) / ((n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo))) @classmethod def chi_sq(cls, n_ii, n_ix_xi_tuple, n_xx): """Scores bigrams using chi-square, i.e. phi-sq multiplied by the number of bigrams, as in Manning and Schutze 5.3.3. """ (n_ix, n_xi) = n_ix_xi_tuple return n_xx * cls.phi_sq(n_ii, (n_ix, n_xi), n_xx) @classmethod def fisher(cls, *marginals): """Scores bigrams using Fisher's Exact Test (Pedersen 1996). Less sensitive to small counts than PMI or Chi Sq, but also more expensive to compute. Requires scipy. """ n_ii, n_io, n_oi, n_oo = cls._contingency(*marginals) (odds, pvalue) = fisher_exact([[n_ii, n_io], [n_oi, n_oo]], alternative='less') return pvalue @staticmethod def dice(n_ii, n_ix_xi_tuple, n_xx): """Scores bigrams using Dice's coefficient.""" (n_ix, n_xi) = n_ix_xi_tuple return 2 * float(n_ii) / (n_ix + n_xi) class TrigramAssocMeasures(NgramAssocMeasures): """ A collection of trigram association measures. Each association measure is provided as a function with four arguments:: trigram_score_fn(n_iii, (n_iix, n_ixi, n_xii), (n_ixx, n_xix, n_xxi), n_xxx) The arguments constitute the marginals of a contingency table, counting the occurrences of particular events in a corpus. The letter i in the suffix refers to the appearance of the word in question, while x indicates the appearance of any word. Thus, for example: n_iii counts (w1, w2, w3), i.e. the trigram being scored n_ixx counts (w1, *, *) n_xxx counts (*, *, *), i.e. any trigram """ _n = 3 @staticmethod def _contingency(n_iii, n_iix_tuple, n_ixx_tuple, n_xxx): """Calculates values of a trigram contingency table (or cube) from marginal values. >>> TrigramAssocMeasures._contingency(1, (1, 1, 1), (1, 73, 1), 2000) (1, 0, 0, 0, 0, 72, 0, 1927) """ (n_iix, n_ixi, n_xii) = n_iix_tuple (n_ixx, n_xix, n_xxi) = n_ixx_tuple n_oii = n_xii - n_iii n_ioi = n_ixi - n_iii n_iio = n_iix - n_iii n_ooi = n_xxi - n_iii - n_oii - n_ioi n_oio = n_xix - n_iii - n_oii - n_iio n_ioo = n_ixx - n_iii - n_ioi - n_iio n_ooo = n_xxx - n_iii - n_oii - n_ioi - n_iio - n_ooi - n_oio - n_ioo return (n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo) @staticmethod def _marginals(*contingency): """Calculates values of contingency table marginals from its values. 
>>> TrigramAssocMeasures._marginals(1, 0, 0, 0, 0, 72, 0, 1927) (1, (1, 1, 1), (1, 73, 1), 2000) """ n_iii, n_oii, n_ioi, n_ooi, n_iio, n_oio, n_ioo, n_ooo = contingency return (n_iii, (n_iii + n_iio, n_iii + n_ioi, n_iii + n_oii), (n_iii + n_ioi + n_iio + n_ioo, n_iii + n_oii + n_iio + n_oio, n_iii + n_oii + n_ioi + n_ooi), sum(contingency)) class QuadgramAssocMeasures(NgramAssocMeasures): """ A collection of quadgram association measures. Each association measure is provided as a function with five arguments:: trigram_score_fn(n_iiii, (n_iiix, n_iixi, n_ixii, n_xiii), (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix), (n_ixxx, n_xixx, n_xxix, n_xxxi), n_all) The arguments constitute the marginals of a contingency table, counting the occurrences of particular events in a corpus. The letter i in the suffix refers to the appearance of the word in question, while x indicates the appearance of any word. Thus, for example: n_iiii counts (w1, w2, w3, w4), i.e. the quadgram being scored n_ixxi counts (w1, *, *, w4) n_xxxx counts (*, *, *, *), i.e. any quadgram """ _n = 4 @staticmethod def _contingency(n_iiii, n_iiix_tuple, n_iixx_tuple, n_ixxx_tuple, n_xxxx): """Calculates values of a quadgram contingency table from marginal values. """ (n_iiix, n_iixi, n_ixii, n_xiii) = n_iiix_tuple (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix) = n_iixx_tuple (n_ixxx, n_xixx, n_xxix, n_xxxi) = n_ixxx_tuple n_oiii = n_xiii - n_iiii n_ioii = n_ixii - n_iiii n_iioi = n_iixi - n_iiii n_ooii = n_xxii - n_iiii - n_oiii - n_ioii n_oioi = n_xixi - n_iiii - n_oiii - n_iioi n_iooi = n_ixxi - n_iiii - n_ioii - n_iioi n_oooi = n_xxxi - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_iooi - n_oioi n_iiio = n_iiix - n_iiii n_oiio = n_xiix - n_iiii - n_oiii - n_iiio n_ioio = n_ixix - n_iiii - n_ioii - n_iiio n_ooio = n_xxix - n_iiii - n_oiii - n_ioii - n_iiio - n_ooii - n_ioio - n_oiio n_iioo = n_iixx - n_iiii - n_iioi - n_iiio n_oioo = n_xixx - n_iiii - n_oiii - n_iioi - n_iiio - n_oioi - n_oiio - n_iioo n_iooo = n_ixxx - n_iiii - n_ioii - n_iioi - n_iiio - n_iooi - n_iioo - n_ioio n_oooo = n_xxxx - n_iiii - n_oiii - n_ioii - n_iioi - n_ooii - n_oioi - n_iooi - \ n_oooi - n_iiio - n_oiio - n_ioio - n_ooio - n_iioo - n_oioo - n_iooo return (n_iiii, n_oiii, n_ioii, n_ooii, n_iioi, n_oioi, n_iooi, n_oooi, n_iiio, n_oiio, n_ioio, n_ooio, n_iioo, n_oioo, n_iooo, n_oooo) @staticmethod def _marginals(*contingency): """Calculates values of contingency table marginals from its values. 
QuadgramAssocMeasures._marginals(1, 0, 2, 46, 552, 825, 2577, 34967, 1, 0, 2, 48, 7250, 9031, 28585, 356653) (1, (2, 553, 3, 1), (7804, 6, 3132, 1378, 49, 2), (38970, 17660, 100, 38970), 440540) """ n_iiii, n_oiii, n_ioii, n_ooii, n_iioi, n_oioi, n_iooi, n_oooi, n_iiio, n_oiio, n_ioio, n_ooio, \ n_iioo, n_oioo, n_iooo, n_oooo = contingency n_iiix = n_iiii + n_iiio n_iixi = n_iiii + n_iioi n_ixii = n_iiii + n_ioii n_xiii = n_iiii + n_oiii n_iixx = n_iiii + n_iioi + n_iiio + n_iioo n_ixix = n_iiii + n_ioii + n_iiio + n_ioio n_ixxi = n_iiii + n_ioii + n_iioi + n_iooi n_xixi = n_iiii + n_oiii + n_iioi + n_oioi n_xxii = n_iiii + n_oiii + n_ioii + n_ooii n_xiix = n_iiii + n_oiii + n_iiio + n_oiio n_ixxx = n_iiii + n_ioii + n_iioi + n_iiio + n_iooi + n_iioo + n_ioio + n_iooo n_xixx = n_iiii + n_oiii + n_iioi + n_iiio + n_oioi + n_oiio + n_iioo + n_oioo n_xxix = n_iiii + n_oiii + n_ioii + n_iiio + n_ooii + n_ioio + n_oiio + n_ooio n_xxxi = n_iiii + n_oiii + n_ioii + n_iioi + n_ooii + n_iooi + n_oioi + n_oooi n_all = sum(contingency) return (n_iiii, (n_iiix, n_iixi, n_ixii, n_xiii), (n_iixx, n_ixix, n_ixxi, n_xixi, n_xxii, n_xiix), (n_ixxx, n_xixx, n_xxix, n_xxxi), n_all) class ContingencyMeasures(object): """Wraps NgramAssocMeasures classes such that the arguments of association measures are contingency table values rather than marginals. """ def __init__(self, measures): """Constructs a ContingencyMeasures given a NgramAssocMeasures class""" self.__class__.__name__ = 'Contingency' + measures.__class__.__name__ for k in dir(measures): if k.startswith('__'): continue v = getattr(measures, k) if not k.startswith('_'): v = self._make_contingency_fn(measures, v) setattr(self, k, v) @staticmethod def _make_contingency_fn(measures, old_fn): """From an association measure function, produces a new function which accepts contingency table values as its arguments. """ def res(*contingency): return old_fn(*measures._marginals(*contingency)) res.__doc__ = old_fn.__doc__ res.__name__ = old_fn.__name__ return res nltk-3.1/nltk/metrics/confusionmatrix.py0000644000076500000240000001670312607224144020305 0ustar sbstaff00000000000000# Natural Language Toolkit: Confusion Matrices # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT from __future__ import print_function, unicode_literals from nltk.probability import FreqDist from nltk.compat import python_2_unicode_compatible @python_2_unicode_compatible class ConfusionMatrix(object): """ The confusion matrix between a list of reference values and a corresponding list of test values. Entry *[r,t]* of this matrix is a count of the number of times that the reference value *r* corresponds to the test value *t*. E.g.: >>> from nltk.metrics import ConfusionMatrix >>> ref = 'DET NN VB DET JJ NN NN IN DET NN'.split() >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split() >>> cm = ConfusionMatrix(ref, test) >>> print(cm['NN', 'NN']) 3 Note that the diagonal entries *Ri=Tj* of this matrix corresponds to correct values; and the off-diagonal entries correspond to incorrect values. """ def __init__(self, reference, test, sort_by_count=False): """ Construct a new confusion matrix from a list of reference values and a corresponding list of test values. :type reference: list :param reference: An ordered list of reference values. :type test: list :param test: A list of values to compare against the corresponding reference values. :raise ValueError: If ``reference`` and ``length`` do not have the same length. 
""" if len(reference) != len(test): raise ValueError('Lists must have the same length.') # Get a list of all values. if sort_by_count: ref_fdist = FreqDist(reference) test_fdist = FreqDist(test) def key(v): return -(ref_fdist[v]+test_fdist[v]) values = sorted(set(reference+test), key=key) else: values = sorted(set(reference+test)) # Construct a value->index dictionary indices = dict((val,i) for (i,val) in enumerate(values)) # Make a confusion matrix table. confusion = [[0 for val in values] for val in values] max_conf = 0 # Maximum confusion for w,g in zip(reference, test): confusion[indices[w]][indices[g]] += 1 max_conf = max(max_conf, confusion[indices[w]][indices[g]]) #: A list of all values in ``reference`` or ``test``. self._values = values #: A dictionary mapping values in ``self._values`` to their indices. self._indices = indices #: The confusion matrix itself (as a list of lists of counts). self._confusion = confusion #: The greatest count in ``self._confusion`` (used for printing). self._max_conf = max_conf #: The total number of values in the confusion matrix. self._total = len(reference) #: The number of correct (on-diagonal) values in the matrix. self._correct = sum(confusion[i][i] for i in range(len(values))) def __getitem__(self, li_lj_tuple): """ :return: The number of times that value ``li`` was expected and value ``lj`` was given. :rtype: int """ (li, lj) = li_lj_tuple i = self._indices[li] j = self._indices[lj] return self._confusion[i][j] def __repr__(self): return '' % (self._correct, self._total) def __str__(self): return self.pretty_format() def pretty_format(self, show_percents=False, values_in_chart=True, truncate=None, sort_by_count=False): """ :return: A multi-line string representation of this confusion matrix. :type truncate: int :param truncate: If specified, then only show the specified number of values. Any sorting (e.g., sort_by_count) will be performed before truncation. :param sort_by_count: If true, then sort by the count of each label in the reference data. I.e., labels that occur more frequently in the reference label will be towards the left edge of the matrix, and labels that occur less frequently will be towards the right edge. @todo: add marginals? """ confusion = self._confusion values = self._values if sort_by_count: values = sorted(values, key=lambda v: -sum(self._confusion[self._indices[v]])) if truncate: values = values[:truncate] if values_in_chart: value_strings = ["%s" % val for val in values] else: value_strings = [str(n+1) for n in range(len(values))] # Construct a format string for row values valuelen = max(len(val) for val in value_strings) value_format = '%' + repr(valuelen) + 's | ' # Construct a format string for matrix entries if show_percents: entrylen = 6 entry_format = '%5.1f%%' zerostr = ' .' else: entrylen = len(repr(self._max_conf)) entry_format = '%' + repr(entrylen) + 'd' zerostr = ' '*(entrylen-1) + '.' # Write the column values. s = '' for i in range(valuelen): s += (' '*valuelen)+' |' for val in value_strings: if i >= valuelen-len(val): s += val[i-valuelen+len(val)].rjust(entrylen+1) else: s += ' '*(entrylen+1) s += ' |\n' # Write a dividing line s += '%s-+-%s+\n' % ('-'*valuelen, '-'*((entrylen+1)*len(values))) # Write the entries. 
for val, li in zip(value_strings, values): i = self._indices[li] s += value_format % val for lj in values: j = self._indices[lj] if confusion[i][j] == 0: s += zerostr elif show_percents: s += entry_format % (100.0*confusion[i][j]/self._total) else: s += entry_format % confusion[i][j] if i == j: prevspace = s.rfind(' ') s = s[:prevspace] + '<' + s[prevspace+1:] + '>' else: s += ' ' s += '|\n' # Write a dividing line s += '%s-+-%s+\n' % ('-'*valuelen, '-'*((entrylen+1)*len(values))) # Write a key s += '(row = reference; col = test)\n' if not values_in_chart: s += 'Value key:\n' for i, value in enumerate(values): s += '%6d: %s\n' % (i+1, value) return s def key(self): values = self._values str = 'Value key:\n' indexlen = len(repr(len(values)-1)) key_format = ' %'+repr(indexlen)+'d: %s\n' for i in range(len(values)): str += key_format % (i, values[i]) return str def demo(): reference = 'DET NN VB DET JJ NN NN IN DET NN'.split() test = 'DET VB VB DET NN NN NN IN DET NN'.split() print('Reference =', reference) print('Test =', test) print('Confusion matrix:') print(ConfusionMatrix(reference, test)) print(ConfusionMatrix(reference, test).pretty_format(sort_by_count=True)) if __name__ == '__main__': demo() nltk-3.1/nltk/metrics/distance.py0000644000076500000240000001346712607224144016653 0ustar sbstaff00000000000000# Natural Language Toolkit: Distance Metrics # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # Tom Lippincott # URL: # For license information, see LICENSE.TXT # """ Distance Metrics. Compute the distance between two items (usually strings). As metrics, they must satisfy the following three requirements: 1. d(a, a) = 0 2. d(a, b) >= 0 3. d(a, c) <= d(a, b) + d(b, c) """ from __future__ import print_function def _edit_dist_init(len1, len2): lev = [] for i in range(len1): lev.append([0] * len2) # initialize 2D array to zero for i in range(len1): lev[i][0] = i # column 0: 0,1,2,3,4,... for j in range(len2): lev[0][j] = j # row 0: 0,1,2,3,4,... return lev def _edit_dist_step(lev, i, j, s1, s2, transpositions=False): c1 = s1[i - 1] c2 = s2[j - 1] # skipping a character in s1 a = lev[i - 1][j] + 1 # skipping a character in s2 b = lev[i][j - 1] + 1 # substitution c = lev[i - 1][j - 1] + (c1 != c2) # transposition d = c + 1 # never picked by default if transpositions and i > 1 and j > 1: if s1[i - 2] == c2 and s2[j - 2] == c1: d = lev[i - 2][j - 2] + 1 # pick the cheapest lev[i][j] = min(a, b, c, d) def edit_distance(s1, s2, transpositions=False): """ Calculate the Levenshtein edit-distance between two strings. The edit distance is the number of characters that need to be substituted, inserted, or deleted, to transform s1 into s2. For example, transforming "rain" to "shine" requires three steps, consisting of two substitutions and one insertion: "rain" -> "sain" -> "shin" -> "shine". These operations could have been done in other orders, but at least three steps are needed. This also optionally allows transposition edits (e.g., "ab" -> "ba"), though this is disabled by default. :param s1, s2: The strings to be analysed :param transpositions: Whether to allow transposition edits :type s1: str :type s2: str :type transpositions: bool :rtype int """ # set up a 2-D array len1 = len(s1) len2 = len(s2) lev = _edit_dist_init(len1 + 1, len2 + 1) # iterate over the array for i in range(len1): for j in range(len2): _edit_dist_step(lev, i + 1, j + 1, s1, s2, transpositions=transpositions) return lev[len1][len2] def binary_distance(label1, label2): """Simple equality test. 
0.0 if the labels are identical, 1.0 if they are different. >>> from nltk.metrics import binary_distance >>> binary_distance(1,1) 0.0 >>> binary_distance(1,3) 1.0 """ return 0.0 if label1 == label2 else 1.0 def jaccard_distance(label1, label2): """Distance metric comparing set-similarity. """ return (len(label1.union(label2)) - len(label1.intersection(label2)))/float(len(label1.union(label2))) def masi_distance(label1, label2): """Distance metric that takes into account partial agreement when multiple labels are assigned. >>> from nltk.metrics import masi_distance >>> masi_distance(set([1, 2]), set([1, 2, 3, 4])) 0.665... Passonneau 2006, Measuring Agreement on Set-Valued Items (MASI) for Semantic and Pragmatic Annotation. """ len_intersection = len(label1.intersection(label2)) len_union = len(label1.union(label2)) len_label1 = len(label1) len_label2 = len(label2) if len_label1 == len_label2 and len_label1 == len_intersection: m = 1 elif len_intersection == min(len_label1, len_label2): m = 0.67 elif len_intersection > 0: m = 0.33 else: m = 0 return 1 - (len_intersection / float(len_union)) * m def interval_distance(label1,label2): """Krippendorff's interval distance metric >>> from nltk.metrics import interval_distance >>> interval_distance(1,10) 81 Krippendorff 1980, Content Analysis: An Introduction to its Methodology """ try: return pow(label1 - label2, 2) # return pow(list(label1)[0]-list(label2)[0],2) except: print("non-numeric labels not supported with interval distance") def presence(label): """Higher-order function to test presence of a given label """ return lambda x, y: 1.0 * ((label in x) == (label in y)) def fractional_presence(label): return lambda x, y:\ abs((float(1.0 / len(x)) - float(1.0 / len(y)))) * (label in x and label in y) \ or 0.0 * (label not in x and label not in y) \ or abs(float(1.0 / len(x))) * (label in x and label not in y) \ or (float(1.0 / len(y))) * (label not in x and label in y) def custom_distance(file): data = {} with open(file, 'r') as infile: for l in infile: labelA, labelB, dist = l.strip().split("\t") labelA = frozenset([labelA]) labelB = frozenset([labelB]) data[frozenset([labelA,labelB])] = float(dist) return lambda x,y:data[frozenset([x,y])] def demo(): edit_distance_examples = [ ("rain", "shine"), ("abcdef", "acbdef"), ("language", "lnaguaeg"), ("language", "lnaugage"), ("language", "lngauage")] for s1, s2 in edit_distance_examples: print("Edit distance between '%s' and '%s':" % (s1, s2), edit_distance(s1, s2)) for s1, s2 in edit_distance_examples: print("Edit distance with transpositions between '%s' and '%s':" % (s1, s2), edit_distance(s1, s2, transpositions=True)) s1 = set([1, 2, 3, 4]) s2 = set([3, 4, 5]) print("s1:", s1) print("s2:", s2) print("Binary distance:", binary_distance(s1, s2)) print("Jaccard distance:", jaccard_distance(s1, s2)) print("MASI distance:", masi_distance(s1, s2)) if __name__ == '__main__': demo() nltk-3.1/nltk/metrics/paice.py0000644000076500000240000003346612607224144016143 0ustar sbstaff00000000000000# Natural Language Toolkit: Agreement Metrics # # Copyright (C) 2001-2015 NLTK Project # Author: Lauri Hallila # URL: # For license information, see LICENSE.TXT # """Counts Paice's performance statistics for evaluating stemming algorithms. 
What is required: - A dictionary of words grouped by their real lemmas - A dictionary of words grouped by stems from a stemming algorithm When these are given, Understemming Index (UI), Overstemming Index (OI), Stemming Weight (SW) and Error-rate relative to truncation (ERRT) are counted. References: Chris D. Paice (1994). An evaluation method for stemming algorithms. In Proceedings of SIGIR, 42--50. """ from math import sqrt def get_words_from_dictionary(lemmas): ''' Get original set of words used for analysis. :param lemmas: A dictionary where keys are lemmas and values are sets or lists of words corresponding to that lemma. :type lemmas: dict :return: Set of words that exist as values in the dictionary :rtype: set ''' words = set() for lemma in lemmas: words.update(set(lemmas[lemma])) return words def _truncate(words, cutlength): '''Group words by stems defined by truncating them at given length. :param words: Set of words used for analysis :param cutlength: Words are stemmed by cutting at this length. :type words: set or list :type cutlength: int :return: Dictionary where keys are stems and values are sets of words corresponding to that stem. :rtype: dict ''' stems = {} for word in words: stem = word[:cutlength] try: stems[stem].update([word]) except KeyError: stems[stem] = set([word]) return stems # Reference: http://en.wikipedia.org/wiki/Line-line_intersection def _count_intersection(l1, l2): '''Count intersection between two line segments defined by coordinate pairs. :param l1: Tuple of two coordinate pairs defining the first line segment :param l2: Tuple of two coordinate pairs defining the second line segment :type l1: tuple :type l2: tuple :return: Coordinates of the intersection :rtype: tuple ''' x1, y1 = l1[0] x2, y2 = l1[1] x3, y3 = l2[0] x4, y4 = l2[1] denominator = (x1 - x2) * (y3 - y4) - (y1 - y2) * (x3 - x4) if denominator == 0.0: # lines are parallel if x1 == x2 == x3 == x4 == 0.0: # When lines are parallel, they must be on the y-axis. # We can ignore x-axis because we stop counting the # truncation line when we get there. # There are no other options as UI (x-axis) grows and # OI (y-axis) diminishes when we go along the truncation line. return (0.0, y4) x = ((x1 * y2 - y1 * x2) * (x3 - x4) - (x1 - x2) * (x3 * y4 - y3 * x4)) / denominator y = ((x1 * y2 - y1 * x2) * (y3 - y4) - (y1 - y2) * (x3 * y4 - y3 * x4)) / denominator return (x, y) def _get_derivative(coordinates): '''Get derivative of the line from (0,0) to given coordinates. :param coordinates: A coordinate pair :type coordinates: tuple :return: Derivative; inf if x is zero :rtype: float ''' try: return coordinates[1] / coordinates[0] except ZeroDivisionError: return float('inf') def _calculate_cut(lemmawords, stems): '''Count understemmed and overstemmed pairs for (lemma, stem) pair with common words. :param lemmawords: Set or list of words corresponding to certain lemma. :param stems: A dictionary where keys are stems and values are sets or lists of words corresponding to that stem. :type lemmawords: set or list :type stems: dict :return: Amount of understemmed and overstemmed pairs contributed by words existing in both lemmawords and stems. 
:rtype: tuple ''' umt, wmt = 0.0, 0.0 for stem in stems: cut = set(lemmawords) & set(stems[stem]) if cut: cutcount = len(cut) stemcount = len(stems[stem]) # Unachieved merge total umt += cutcount * (len(lemmawords) - cutcount) # Wrongly merged total wmt += cutcount * (stemcount - cutcount) return (umt, wmt) def _calculate(lemmas, stems): '''Calculate actual and maximum possible amounts of understemmed and overstemmed word pairs. :param lemmas: A dictionary where keys are lemmas and values are sets or lists of words corresponding to that lemma. :param stems: A dictionary where keys are stems and values are sets or lists of words corresponding to that stem. :type lemmas: dict :type stems: dict :return: Global unachieved merge total (gumt), global desired merge total (gdmt), global wrongly merged total (gwmt) and global desired non-merge total (gdnt). :rtype: tuple ''' n = sum(len(lemmas[word]) for word in lemmas) gdmt, gdnt, gumt, gwmt = (0.0, 0.0, 0.0, 0.0) for lemma in lemmas: lemmacount = len(lemmas[lemma]) # Desired merge total gdmt += lemmacount * (lemmacount - 1) # Desired non-merge total gdnt += lemmacount * (n - lemmacount) # For each (lemma, stem) pair with common words, count how many # pairs are understemmed and overstemmed. umt, wmt = _calculate_cut(lemmas[lemma], stems) # Add to total undesired and wrongly-merged totals gumt += umt gwmt += wmt # Each object is counted twice, so divide by two return (gumt / 2, gdmt / 2, gwmt / 2, gdnt / 2) def _indexes(gumt, gdmt, gwmt, gdnt): '''Count Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW). :param gumt, gdmt, gwmt, gdnt: Global unachieved merge total (gumt), global desired merge total (gdmt), global wrongly merged total (gwmt) and global desired non-merge total (gdnt). :type gumt, gdmt, gwmt, gdnt: float :return: Understemming Index (UI), Overstemming Index (OI) and Stemming Weight (SW). :rtype: tuple ''' # Calculate Understemming Index (UI), # Overstemming Index (OI) and Stemming Weight (SW) try: ui = gumt / gdmt except ZeroDivisionError: # If GDMT (max merge total) is 0, define UI as 0 ui = 0.0 try: oi = gwmt / gdnt except ZeroDivisionError: # IF GDNT (max non-merge total) is 0, define OI as 0 oi = 0.0 try: sw = oi / ui except ZeroDivisionError: if oi == 0.0: # OI and UI are 0, define SW as 'not a number' sw = float('nan') else: # UI is 0, define SW as infinity sw = float('inf') return (ui, oi, sw) class Paice(object): '''Class for storing lemmas, stems and evaluation metrics.''' def __init__(self, lemmas, stems): ''' :param lemmas: A dictionary where keys are lemmas and values are sets or lists of words corresponding to that lemma. :param stems: A dictionary where keys are stems and values are sets or lists of words corresponding to that stem. 
:type lemmas: dict :type stems: dict ''' self.lemmas = lemmas self.stems = stems self.coords = [] self.gumt, self.gdmt, self.gwmt, self.gdnt = (None, None, None, None) self.ui, self.oi, self.sw = (None, None, None) self.errt = None self.update() def __str__(self): text = ['Global Unachieved Merge Total (GUMT): %s\n' % self.gumt] text.append('Global Desired Merge Total (GDMT): %s\n' % self.gdmt) text.append('Global Wrongly-Merged Total (GWMT): %s\n' % self.gwmt) text.append('Global Desired Non-merge Total (GDNT): %s\n' % self.gdnt) text.append('Understemming Index (GUMT / GDMT): %s\n' % self.ui) text.append('Overstemming Index (GWMT / GDNT): %s\n' % self.oi) text.append('Stemming Weight (OI / UI): %s\n' % self.sw) text.append('Error-Rate Relative to Truncation (ERRT): %s\r\n' % self.errt) coordinates = ' '.join(['(%s, %s)' % item for item in self.coords]) text.append('Truncation line: %s' % coordinates) return ''.join(text) def _get_truncation_indexes(self, words, cutlength): '''Count (UI, OI) when stemming is done by truncating words at \'cutlength\'. :param words: Words used for the analysis :param cutlength: Words are stemmed by cutting them at this length :type words: set or list :type cutlength: int :return: Understemming and overstemming indexes :rtype: tuple ''' truncated = _truncate(words, cutlength) gumt, gdmt, gwmt, gdnt = _calculate(self.lemmas, truncated) ui, oi = _indexes(gumt, gdmt, gwmt, gdnt)[:2] return (ui, oi) def _get_truncation_coordinates(self, cutlength=0): '''Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line. :param cutlength: Optional parameter to start counting from (ui, oi) coordinates gotten by stemming at this length. Useful for speeding up the calculations when you know the approximate location of the intersection. :type cutlength: int :return: List of coordinate pairs that define the truncation line :rtype: list ''' words = get_words_from_dictionary(self.lemmas) maxlength = max(len(word) for word in words) # Truncate words from different points until (0, 0) - (ui, oi) segment crosses the truncation line coords = [] while cutlength <= maxlength: # Get (UI, OI) pair of current truncation point pair = self._get_truncation_indexes(words, cutlength) # Store only new coordinates so we'll have an actual # line segment when counting the intersection point if pair not in coords: coords.append(pair) if pair == (0.0, 0.0): # Stop counting if truncation line goes through origo; # length from origo to truncation line is 0 return coords if len(coords) >= 2 and pair[0] > 0.0: derivative1 = _get_derivative(coords[-2]) derivative2 = _get_derivative(coords[-1]) # Derivative of the truncation line is a decreasing value; # when it passes Stemming Weight, we've found the segment # of truncation line intersecting with (0, 0) - (ui, oi) segment if derivative1 >= self.sw >= derivative2: return coords cutlength += 1 return coords def _errt(self): '''Count Error-Rate Relative to Truncation (ERRT). :return: ERRT, length of the line from origo to (UI, OI) divided by the length of the line from origo to the point defined by the same line when extended until the truncation line. 
:rtype: float ''' # Count (UI, OI) pairs for truncation points until we find the segment where (ui, oi) crosses the truncation line self.coords = self._get_truncation_coordinates() if (0.0, 0.0) in self.coords: # Truncation line goes through origo, so ERRT cannot be counted if (self.ui, self.oi) != (0.0, 0.0): return float('inf') else: return float('nan') if (self.ui, self.oi) == (0.0, 0.0): # (ui, oi) is origo; define errt as 0.0 return 0.0 # Count the intersection point # Note that (self.ui, self.oi) cannot be (0.0, 0.0) and self.coords has different coordinates # so we have actual line segments instead of a line segment and a point intersection = _count_intersection(((0, 0), (self.ui, self.oi)), self.coords[-2:] ) # Count OP (length of the line from origo to (ui, oi)) op = sqrt(self.ui ** 2 + self.oi ** 2) # Count OT (length of the line from origo to truncation line that goes through (ui, oi)) ot = sqrt(intersection[0] ** 2 + intersection[1] ** 2) # OP / OT tells how well the stemming algorithm works compared to just truncating words return op / ot def update(self): '''Update statistics after lemmas and stems have been set.''' self.gumt, self.gdmt, self.gwmt, self.gdnt = _calculate(self.lemmas, self.stems) self.ui, self.oi, self.sw = _indexes(self.gumt, self.gdmt, self.gwmt, self.gdnt) self.errt = self._errt() def demo(): '''Demonstration of the module.''' # Some words with their real lemmas lemmas = {'kneel': ['kneel', 'knelt'], 'range': ['range', 'ranged'], 'ring': ['ring', 'rang', 'rung'] } # Same words with stems from a stemming algorithm stems = {'kneel': ['kneel'], 'knelt': ['knelt'], 'rang': ['rang', 'range', 'ranged'], 'ring': ['ring'], 'rung': ['rung'] } print('Words grouped by their lemmas:') for lemma in sorted(lemmas): print('%s => %s' % (lemma, ' '.join(lemmas[lemma]))) print() print('Same words grouped by a stemming algorithm:') for stem in sorted(stems): print('%s => %s' % (stem, ' '.join(stems[stem]))) print() p = Paice(lemmas, stems) print(p) print() # Let's "change" results from a stemming algorithm stems = {'kneel': ['kneel'], 'knelt': ['knelt'], 'rang': ['rang'], 'range': ['range', 'ranged'], 'ring': ['ring'], 'rung': ['rung'] } print('Counting stats after changing stemming results:') for stem in sorted(stems): print('%s => %s' % (stem, ' '.join(stems[stem]))) print() p.stems = stems p.update() print(p) if __name__ == '__main__': demo() nltk-3.1/nltk/metrics/scores.py0000644000076500000240000001724112607224144016351 0ustar sbstaff00000000000000# Natural Language Toolkit: Evaluation # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT from __future__ import print_function from math import fabs import operator from random import shuffle from functools import reduce try: from scipy.stats.stats import betai except ImportError: betai = None from nltk.compat import xrange, izip from nltk.util import LazyConcatenation, LazyMap def accuracy(reference, test): """ Given a list of reference values and a corresponding list of test values, return the fraction of corresponding values that are equal. 
In particular, return the fraction of indices ``0= actual_stat: c += 1 if verbose and i % 10 == 0: print('pseudo-statistic: %f' % pseudo_stat) print('significance: %f' % (float(c + 1) / (i + 1))) print('-' * 60) significance = float(c + 1) / (shuffles + 1) if verbose: print('significance: %f' % significance) if betai: for phi in [0.01, 0.05, 0.10, 0.15, 0.25, 0.50]: print("prob(phi<=%f): %f" % (phi, betai(c, shuffles, phi))) return (significance, c, shuffles) def demo(): print('-'*75) reference = 'DET NN VB DET JJ NN NN IN DET NN'.split() test = 'DET VB VB DET NN NN NN IN DET NN'.split() print('Reference =', reference) print('Test =', test) print('Accuracy:', accuracy(reference, test)) print('-'*75) reference_set = set(reference) test_set = set(test) print('Reference =', reference_set) print('Test = ', test_set) print('Precision:', precision(reference_set, test_set)) print(' Recall:', recall(reference_set, test_set)) print('F-Measure:', f_measure(reference_set, test_set)) print('-'*75) if __name__ == '__main__': demo() nltk-3.1/nltk/metrics/segmentation.py0000644000076500000240000001602212607224144017544 0ustar sbstaff00000000000000# Natural Language Toolkit: Text Segmentation Metrics # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # David Doukhan # URL: # For license information, see LICENSE.TXT """ Text Segmentation Metrics 1. Windowdiff Pevzner, L., and Hearst, M., A Critique and Improvement of an Evaluation Metric for Text Segmentation, Computational Linguistics 28, 19-36 2. Generalized Hamming Distance Bookstein A., Kulyukin V.A., Raita T. Generalized Hamming Distance Information Retrieval 5, 2002, pp 353-375 Baseline implementation in C++ http://digital.cs.usu.edu/~vkulyukin/vkweb/software/ghd/ghd.html Study describing benefits of Generalized Hamming Distance Versus WindowDiff for evaluating text segmentation tasks Begsten, Y. Quel indice pour mesurer l'efficacite en segmentation de textes ? TALN 2009 3. Pk text segmentation metric Beeferman D., Berger A., Lafferty J. (1999) Statistical Models for Text Segmentation Machine Learning, 34, 177-210 """ try: import numpy as np except ImportError: pass from nltk.compat import xrange def windowdiff(seg1, seg2, k, boundary="1", weighted=False): """ Compute the windowdiff score for a pair of segmentations. A segmentation is any sequence over a vocabulary of two items (e.g. "0", "1"), where the specified boundary value is used to mark the edge of a segmentation. >>> s1 = "000100000010" >>> s2 = "000010000100" >>> s3 = "100000010000" >>> '%.2f' % windowdiff(s1, s1, 3) '0.00' >>> '%.2f' % windowdiff(s1, s2, 3) '0.30' >>> '%.2f' % windowdiff(s2, s3, 3) '0.80' :param seg1: a segmentation :type seg1: str or list :param seg2: a segmentation :type seg2: str or list :param k: window width :type k: int :param boundary: boundary value :type boundary: str or int or bool :param weighted: use the weighted variant of windowdiff :type weighted: boolean :rtype: float """ if len(seg1) != len(seg2): raise ValueError("Segmentations have unequal length") if k > len(seg1): raise ValueError("Window width k should be smaller or equal than segmentation lengths") wd = 0 for i in range(len(seg1) - k + 1): ndiff = abs(seg1[i:i+k].count(boundary) - seg2[i:i+k].count(boundary)) if weighted: wd += ndiff else: wd += min(1, ndiff) return wd / (len(seg1) - k + 1.) 
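# ---------------------------------------------------------------------
# Editor's note: an illustrative usage sketch, not part of the NLTK
# source.  The helper name below is hypothetical; the two segmentations
# are the ones used in the windowdiff doctest above.
def _windowdiff_usage_sketch(k=3):
    seg_a = "000100000010"   # reference boundary string
    seg_b = "000010000100"   # hypothesis with each boundary shifted by one
    # Identical segmentations score 0.0; per the doctest above, the
    # shifted hypothesis scores 0.30 with a window width of 3.
    return windowdiff(seg_a, seg_a, k), windowdiff(seg_a, seg_b, k)
# ---------------------------------------------------------------------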
# Generalized Hamming Distance def _init_mat(nrows, ncols, ins_cost, del_cost): mat = np.empty((nrows, ncols)) mat[0, :] = ins_cost * np.arange(ncols) mat[:, 0] = del_cost * np.arange(nrows) return mat def _ghd_aux(mat, rowv, colv, ins_cost, del_cost, shift_cost_coeff): for i, rowi in enumerate(rowv): for j, colj in enumerate(colv): shift_cost = shift_cost_coeff * abs(rowi - colj) + mat[i, j] if rowi == colj: # boundaries are at the same location, no transformation required tcost = mat[i, j] elif rowi > colj: # boundary match through a deletion tcost = del_cost + mat[i, j + 1] else: # boundary match through an insertion tcost = ins_cost + mat[i + 1, j] mat[i + 1, j + 1] = min(tcost, shift_cost) def ghd(ref, hyp, ins_cost=2.0, del_cost=2.0, shift_cost_coeff=1.0, boundary='1'): """ Compute the Generalized Hamming Distance for a reference and a hypothetical segmentation, corresponding to the cost related to the transformation of the hypothetical segmentation into the reference segmentation through boundary insertion, deletion and shift operations. A segmentation is any sequence over a vocabulary of two items (e.g. "0", "1"), where the specified boundary value is used to mark the edge of a segmentation. Recommended parameter values are a shift_cost_coeff of 2. Associated with a ins_cost, and del_cost equal to the mean segment length in the reference segmentation. >>> # Same examples as Kulyukin C++ implementation >>> ghd('1100100000', '1100010000', 1.0, 1.0, 0.5) 0.5 >>> ghd('1100100000', '1100000001', 1.0, 1.0, 0.5) 2.0 >>> ghd('011', '110', 1.0, 1.0, 0.5) 1.0 >>> ghd('1', '0', 1.0, 1.0, 0.5) 1.0 >>> ghd('111', '000', 1.0, 1.0, 0.5) 3.0 >>> ghd('000', '111', 1.0, 2.0, 0.5) 6.0 :param ref: the reference segmentation :type ref: str or list :param hyp: the hypothetical segmentation :type hyp: str or list :param ins_cost: insertion cost :type ins_cost: float :param del_cost: deletion cost :type del_cost: float :param shift_cost_coeff: constant used to compute the cost of a shift. shift cost = shift_cost_coeff * |i - j| where i and j are the positions indicating the shift :type shift_cost_coeff: float :param boundary: boundary value :type boundary: str or int or bool :rtype: float """ ref_idx = [i for (i, val) in enumerate(ref) if val == boundary] hyp_idx = [i for (i, val) in enumerate(hyp) if val == boundary] nref_bound = len(ref_idx) nhyp_bound = len(hyp_idx) if nref_bound == 0 and nhyp_bound == 0: return 0.0 elif nref_bound > 0 and nhyp_bound == 0: return nref_bound * ins_cost elif nref_bound == 0 and nhyp_bound > 0: return nhyp_bound * del_cost mat = _init_mat(nhyp_bound + 1, nref_bound + 1, ins_cost, del_cost) _ghd_aux(mat, hyp_idx, ref_idx, ins_cost, del_cost, shift_cost_coeff) return mat[-1, -1] # Beeferman's Pk text segmentation evaluation metric def pk(ref, hyp, k=None, boundary='1'): """ Compute the Pk metric for a pair of segmentations A segmentation is any sequence over a vocabulary of two items (e.g. "0", "1"), where the specified boundary value is used to mark the edge of a segmentation. 
>>> '%.2f' % pk('0100'*100, '1'*400, 2) '0.50' >>> '%.2f' % pk('0100'*100, '0'*400, 2) '0.50' >>> '%.2f' % pk('0100'*100, '0100'*100, 2) '0.00' :param ref: the reference segmentation :type ref: str or list :param hyp: the segmentation to evaluate :type hyp: str or list :param k: window size, if None, set to half of the average reference segment length :type boundary: str or int or bool :param boundary: boundary value :type boundary: str or int or bool :rtype: float """ if k is None: k = int(round(len(ref) / (ref.count(boundary) * 2.))) err = 0 for i in xrange(len(ref)-k +1): r = ref[i:i+k].count(boundary) > 0 h = hyp[i:i+k].count(boundary) > 0 if r != h: err += 1 return err / (len(ref)-k +1.) # skip doctests if numpy is not installed def setup_module(module): from nose import SkipTest try: import numpy except ImportError: raise SkipTest("numpy is required for nltk.metrics.segmentation") nltk-3.1/nltk/metrics/spearman.py0000644000076500000240000000412012607224144016651 0ustar sbstaff00000000000000# Natural Language Toolkit: Spearman Rank Correlation # # Copyright (C) 2001-2015 NLTK Project # Author: Joel Nothman # URL: # For license information, see LICENSE.TXT """ Tools for comparing ranked lists. """ def _rank_dists(ranks1, ranks2): """Finds the difference between the values in ranks1 and ranks2 for keys present in both dicts. If the arguments are not dicts, they are converted from (key, rank) sequences. """ ranks1 = dict(ranks1) ranks2 = dict(ranks2) for k in ranks1: try: yield k, ranks1[k] - ranks2[k] except KeyError: pass def spearman_correlation(ranks1, ranks2): """Returns the Spearman correlation coefficient for two rankings, which should be dicts or sequences of (key, rank). The coefficient ranges from -1.0 (ranks are opposite) to 1.0 (ranks are identical), and is only calculated for keys in both rankings (for meaningful results, remove keys present in only one list before ranking).""" n = 0 res = 0 for k, d in _rank_dists(ranks1, ranks2): res += d * d n += 1 try: return 1 - (6 * float(res) / (n * (n*n - 1))) except ZeroDivisionError: # Result is undefined if only one item is ranked return 0.0 def ranks_from_sequence(seq): """Given a sequence, yields each element with an increasing rank, suitable for use as an argument to ``spearman_correlation``. """ return ((k, i) for i, k in enumerate(seq)) def ranks_from_scores(scores, rank_gap=1e-15): """Given a sequence of (key, score) tuples, yields each key with an increasing rank, tying with previous key's rank if the difference between their scores is less than rank_gap. Suitable for use as an argument to ``spearman_correlation``. 
""" prev_score = None rank = 0 for i, (key, score) in enumerate(scores): try: if abs(score - prev_score) > rank_gap: rank = i except TypeError: pass yield key, rank prev_score = score nltk-3.1/nltk/misc/0000755000076500000240000000000012610001541013745 5ustar sbstaff00000000000000nltk-3.1/nltk/misc/__init__.py0000644000076500000240000000060612607224144016074 0ustar sbstaff00000000000000# Natural Language Toolkit: Miscellaneous modules # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT from nltk.misc.chomsky import generate_chomsky from nltk.misc.wordfinder import word_finder from nltk.misc.minimalset import MinimalSet from nltk.misc.babelfish import babelize_shell nltk-3.1/nltk/misc/babelfish.py0000644000076500000240000000060412574600335016255 0ustar sbstaff00000000000000""" This module previously provided an interface to Babelfish online translation service; this service is no longer available; this module is kept in NLTK source code in order to provide better error messages for people following the NLTK Book 2.0. """ from __future__ import print_function def babelize_shell(): print("Babelfish online translation service is no longer available.") nltk-3.1/nltk/misc/chomsky.py0000644000076500000240000001215612574600335016020 0ustar sbstaff00000000000000# Chomsky random text generator, version 1.1, Raymond Hettinger, 2005/09/13 # http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/440546 """ CHOMSKY is an aid to writing linguistic papers in the style of the great master. It is based on selected phrases taken from actual books and articles written by Noam Chomsky. Upon request, it assembles the phrases in the elegant stylistic patterns that Chomsky is noted for. To generate n sentences of linguistic wisdom, type (CHOMSKY n) -- for example (CHOMSKY 5) generates half a screen of linguistic truth. """ from __future__ import print_function leadins = """To characterize a linguistic level L, On the other hand, This suggests that It appears that Furthermore, We will bring evidence in favor of the following thesis: To provide a constituent structure for T(Z,K), From C1, it follows that For any transformation which is sufficiently diversified in \ application to be of any interest, Analogously, Clearly, Note that Of course, Suppose, for instance, that Thus With this clarification, Conversely, We have already seen that By combining adjunctions and certain deformations, I suggested that these results would follow from the assumption that If the position of the trace in (99c) were only relatively \ inaccessible to movement, However, this assumption is not correct, since Comparing these examples with their parasitic gap counterparts in \ (96) and (97), we see that In the discussion of resumptive pronouns following (81), So far, Nevertheless, For one thing, Summarizing, then, we assume that A consequence of the approach just outlined is that Presumably, On our assumptions, It may be, then, that It must be emphasized, once again, that Let us continue to suppose that Notice, incidentally, that """ # List of LEADINs to buy time. 
subjects = """ the notion of level of grammaticalness a case of semigrammaticalness of a different sort most of the methodological work in modern linguistics a subset of English sentences interesting on quite independent grounds the natural general principle that will subsume this case an important property of these three types of EC any associated supporting element the appearance of parasitic gaps in domains relatively inaccessible \ to ordinary extraction the speaker-hearer's linguistic intuition the descriptive power of the base component the earlier discussion of deviance this analysis of a formative as a pair of sets of features this selectionally introduced contextual feature a descriptively adequate grammar the fundamental error of regarding functional notions as categorial relational information the systematic use of complex symbols the theory of syntactic features developed earlier""" # List of SUBJECTs chosen for maximum professorial macho. verbs = """can be defined in such a way as to impose delimits suffices to account for cannot be arbitrary in is not subject to does not readily tolerate raises serious doubts about is not quite equivalent to does not affect the structure of may remedy and, at the same time, eliminate is not to be considered in determining is to be regarded as is unspecified with respect to is, apparently, determined by is necessary to impose an interpretation on appears to correlate rather closely with is rather different from""" #List of VERBs chosen for autorecursive obfuscation. objects = """ problems of phonemic and morphological analysis. a corpus of utterance tokens upon which conformity has been defined \ by the paired utterance test. the traditional practice of grammarians. the levels of acceptability from fairly high (e.g. (99a)) to virtual \ gibberish (e.g. (98d)). a stipulation to place the constructions into these various categories. a descriptive fact. a parasitic gap construction. the extended c-command discussed in connection with (34). the ultimate standard that determines the accuracy of any proposed grammar. the system of base rules exclusive of the lexicon. irrelevant intervening contexts in selectional rules. nondistinctness in the sense of distinctive feature theory. a general convention regarding the forms of the grammar. an abstract underlying order. an important distinction in language use. the requirement that branching is not tolerated within the dominance \ scope of a complex symbol. the strong generative capacity of the theory.""" # List of OBJECTs selected for profound sententiousness. import textwrap, random from itertools import chain, islice from nltk.compat import izip def generate_chomsky(times=5, line_length=72): parts = [] for part in (leadins, subjects, verbs, objects): phraselist = list(map(str.strip, part.splitlines())) random.shuffle(phraselist) parts.append(phraselist) output = chain(*islice(izip(*parts), 0, times)) print(textwrap.fill(" ".join(output), line_length)) if __name__ == '__main__': generate_chomsky() nltk-3.1/nltk/misc/minimalset.py0000644000076500000240000000547712607224144016512 0ustar sbstaff00000000000000# Natural Language Toolkit: Minimal Sets # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT from collections import defaultdict class MinimalSet(object): """ Find contexts where more than one possible target value can appear. E.g. 
if targets are word-initial letters, and contexts are the remainders of words, then we would like to find cases like "fat" vs "cat", and "training" vs "draining". If targets are parts-of-speech and contexts are words, then we would like to find cases like wind (noun) 'air in rapid motion', vs wind (verb) 'coil, wrap'. """ def __init__(self, parameters=None): """ Create a new minimal set. :param parameters: The (context, target, display) tuples for the item :type parameters: list(tuple(str, str, str)) """ self._targets = set() # the contrastive information self._contexts = set() # what we are controlling for self._seen = defaultdict(set) # to record what we have seen self._displays = {} # what we will display if parameters: for context, target, display in parameters: self.add(context, target, display) def add(self, context, target, display): """ Add a new item to the minimal set, having the specified context, target, and display form. :param context: The context in which the item of interest appears :type context: str :param target: The item of interest :type target: str :param display: The information to be reported for each item :type display: str """ # Store the set of targets that occurred in this context self._seen[context].add(target) # Keep track of which contexts and targets we have seen self._contexts.add(context) self._targets.add(target) # For a given context and target, store the display form self._displays[(context, target)] = display def contexts(self, minimum=2): """ Determine which contexts occurred with enough distinct targets. :param minimum: the minimum number of distinct target forms :type minimum: int :rtype list """ return [c for c in self._contexts if len(self._seen[c]) >= minimum] def display(self, context, target, default=""): if (context, target) in self._displays: return self._displays[(context, target)] else: return default def display_all(self, context): result = [] for target in self._targets: x = self.display(context, target) if x: result.append(x) return result def targets(self): return self._targets nltk-3.1/nltk/misc/sort.py0000644000076500000240000001037412607224144015327 0ustar sbstaff00000000000000# Natural Language Toolkit: List Sorting # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT """ This module provides a variety of list sorting algorithms, to illustrate the many different algorithms (recipes) for solving a problem, and how to analyze algorithms experimentally. """ from __future__ import print_function, division # These algorithms are taken from: # Levitin (2004) The Design and Analysis of Algorithms ################################################################## # Selection Sort ################################################################## def selection(a): """ Selection Sort: scan the list to find its smallest element, then swap it with the first element. The remainder of the list is one element smaller; apply the same method to this list, and so on. """ count = 0 for i in range(len(a) - 1): min = i for j in range(i+1, len(a)): if a[j] < a[min]: min = j count += 1 a[min],a[i] = a[i],a[min] return count ################################################################## # Bubble Sort ################################################################## def bubble(a): """ Bubble Sort: compare adjacent elements of the list left-to-right, and swap them if they are out of order. 
After one pass through the list swapping adjacent items, the largest item will be in the rightmost position. The remainder is one element smaller; apply the same method to this list, and so on. """ count = 0 for i in range(len(a)-1): for j in range(len(a)-i-1): if a[j+1] < a[j]: a[j],a[j+1] = a[j+1],a[j] count += 1 return count ################################################################## # Merge Sort ################################################################## def _merge_lists(b, c): count = 0 i = j = 0 a = [] while (i < len(b) and j < len(c)): count += 1 if b[i] <= c[j]: a.append(b[i]) i += 1 else: a.append(c[j]) j += 1 if i == len(b): a += c[j:] else: a += b[i:] return a, count def merge(a): """ Merge Sort: split the list in half, and sort each half, then combine the sorted halves. """ count = 0 if len(a) > 1: midpoint = len(a) // 2 b = a[:midpoint] c = a[midpoint:] count_b = merge(b) count_c = merge(c) result, count_a = _merge_lists(b, c) a[:] = result # copy the result back into a. count = count_a + count_b + count_c return count ################################################################## # Quick Sort ################################################################## def _partition(a, l, r): p = a[l]; i = l; j = r+1 count = 0 while True: while i < r: i += 1 if a[i] >= p: break while j > l: j -= 1 if j < l or a[j] <= p: break a[i],a[j] = a[j],a[i] # swap count += 1 if i >= j: break a[i],a[j] = a[j],a[i] # undo last swap a[l],a[j] = a[j],a[l] return j, count def _quick(a, l, r): count = 0 if l # URL: # For license information, see LICENSE.TXT # Simplified from PHP version by Robert Klein # http://fswordfinder.sourceforge.net/ from __future__ import print_function import random # reverse a word with probability 0.5 def revword(word): if random.randint(1,2) == 1: return word[::-1] return word # try to insert word at position x,y; direction encoded in xf,yf def step(word, x, xf, y, yf, grid): for i in range(len(word)): if grid[xf(i)][yf(i)] != "" and grid[xf(i)][yf(i)] != word[i]: return False for i in range(len(word)): grid[xf(i)][yf(i)] = word[i] return True # try to insert word at position x,y, in direction dir def check(word, dir, x, y, grid, rows, cols): if dir==1: if x-len(word)<0 or y-len(word)<0: return False return step(word, x, lambda i:x-i, y, lambda i:y-i, grid) elif dir==2: if x-len(word)<0: return False return step(word, x, lambda i:x-i, y, lambda i:y, grid) elif dir==3: if x-len(word)<0 or y+(len(word)-1)>=cols: return False return step(word, x, lambda i:x-i, y, lambda i:y+i, grid) elif dir==4: if y-len(word)<0: return False return step(word, x, lambda i:x, y, lambda i:y-i, grid) def wordfinder(words, rows=20, cols=20, attempts=50, alph='ABCDEFGHIJKLMNOPQRSTUVWXYZ'): """ Attempt to arrange words into a letter-grid with the specified number of rows and columns. Try each word in several positions and directions, until it can be fitted into the grid, or the maximum number of allowable attempts is exceeded. Returns a tuple consisting of the grid and the words that were successfully placed. 
:param words: the list of words to be put into the grid :type words: list :param rows: the number of rows in the grid :type rows: int :param cols: the number of columns in the grid :type cols: int :param attempts: the number of times to attempt placing a word :type attempts: int :param alph: the alphabet, to be used for filling blank cells :type alph: list :rtype: tuple """ # place longer words first words = sorted(words, key=len, reverse=True) grid = [] # the letter grid used = [] # the words we used # initialize the grid for i in range(rows): grid.append([""] * cols) # try to place each word for word in words: word = word.strip().upper() # normalize save = word # keep a record of the word word = revword(word) for attempt in range(attempts): r = random.randint(0, len(word)) dir = random.choice([1,2,3,4]) x = random.randint(0,rows) y = random.randint(0,cols) if dir==1: x+=r; y+=r elif dir==2: x+=r elif dir==3: x+=r; y-=r elif dir==4: y+=r if 0<=x # Edward Loper # URL: # For license information, see LICENSE.TXT # """ NLTK Parsers Classes and interfaces for producing tree structures that represent the internal organization of a text. This task is known as "parsing" the text, and the resulting tree structures are called the text's "parses". Typically, the text is a single sentence, and the tree structure represents the syntactic structure of the sentence. However, parsers can also be used in other domains. For example, parsers can be used to derive the morphological structure of the morphemes that make up a word, or to derive the discourse structure for a set of utterances. Sometimes, a single piece of text can be represented by more than one tree structure. Texts represented by more than one tree structure are called "ambiguous" texts. Note that there are actually two ways in which a text can be ambiguous: - The text has multiple correct parses. - There is not enough information to decide which of several candidate parses is correct. However, the parser module does *not* distinguish these two types of ambiguity. The parser module defines ``ParserI``, a standard interface for parsing texts; and two simple implementations of that interface, ``ShiftReduceParser`` and ``RecursiveDescentParser``. It also contains three sub-modules for specialized kinds of parsing: - ``nltk.parser.chart`` defines chart parsing, which uses dynamic programming to efficiently parse texts. - ``nltk.parser.probabilistic`` defines probabilistic parsing, which associates a probability with each parse. 
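As a rough illustration (a hedged sketch, not part of the package's documented
examples; the toy grammar below is invented), a parser is constructed from a
grammar and applied to a tokenized sentence::

    from nltk import CFG
    from nltk.parse import RecursiveDescentParser

    toy_grammar = CFG.fromstring(
        "S -> NP VP\nNP -> 'I' | 'cookies'\nVP -> V NP\nV -> 'like'")
    parser = RecursiveDescentParser(toy_grammar)
    for tree in parser.parse('I like cookies'.split()):
        print(tree)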
""" from nltk.parse.api import ParserI from nltk.parse.chart import (ChartParser, SteppingChartParser, TopDownChartParser, BottomUpChartParser, BottomUpLeftCornerChartParser, LeftCornerChartParser) from nltk.parse.featurechart import (FeatureChartParser, FeatureTopDownChartParser, FeatureBottomUpChartParser, FeatureBottomUpLeftCornerChartParser) from nltk.parse.earleychart import (IncrementalChartParser, EarleyChartParser, IncrementalTopDownChartParser, IncrementalBottomUpChartParser, IncrementalBottomUpLeftCornerChartParser, IncrementalLeftCornerChartParser, FeatureIncrementalChartParser, FeatureEarleyChartParser, FeatureIncrementalTopDownChartParser, FeatureIncrementalBottomUpChartParser, FeatureIncrementalBottomUpLeftCornerChartParser) from nltk.parse.pchart import (BottomUpProbabilisticChartParser, InsideChartParser, RandomChartParser, UnsortedChartParser, LongestChartParser) from nltk.parse.recursivedescent import (RecursiveDescentParser, SteppingRecursiveDescentParser) from nltk.parse.shiftreduce import (ShiftReduceParser, SteppingShiftReduceParser) from nltk.parse.util import load_parser, TestGrammar, extract_test_sentences from nltk.parse.viterbi import ViterbiParser from nltk.parse.dependencygraph import DependencyGraph from nltk.parse.projectivedependencyparser import (ProjectiveDependencyParser, ProbabilisticProjectiveDependencyParser) from nltk.parse.nonprojectivedependencyparser import (NonprojectiveDependencyParser, NaiveBayesDependencyScorer, ProbabilisticNonprojectiveParser) from nltk.parse.malt import MaltParser from nltk.parse.evaluate import DependencyEvaluator from nltk.parse.transitionparser import TransitionParser from nltk.parse.bllip import BllipParser nltk-3.1/nltk/parse/api.py0000644000076500000240000000425112607224144015265 0ustar sbstaff00000000000000# Natural Language Toolkit: Parser API # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT # import itertools from nltk.internals import overridden class ParserI(object): """ A processing class for deriving trees that represent possible structures for a sequence of tokens. These tree structures are known as "parses". Typically, parsers are used to derive syntax trees for sentences. But parsers can also be used to derive other kinds of tree structure, such as morphological trees and discourse structures. Subclasses must define: - at least one of: ``parse()``, ``parse_sents()``. Subclasses may define: - ``grammar()`` """ def grammar(self): """ :return: The grammar used by this parser. """ raise NotImplementedError() def parse(self, sent, *args, **kwargs): """ :return: An iterator that generates parse trees for the sentence. When possible this list is sorted from most likely to least likely. :param sent: The sentence to be parsed :type sent: list(str) :rtype: iter(Tree) """ if overridden(self.parse_sents): return next(self.parse_sents([sent], *args, **kwargs)) elif overridden(self.parse_one): return (tree for tree in [self.parse_one(sent, *args, **kwargs)] if tree is not None) elif overridden(self.parse_all): return iter(self.parse_all(sent, *args, **kwargs)) else: raise NotImplementedError() def parse_sents(self, sents, *args, **kwargs): """ Apply ``self.parse()`` to each element of ``sents``. 
:rtype: iter(iter(Tree)) """ return (self.parse(sent, *args, **kwargs) for sent in sents) def parse_all(self, sent, *args, **kwargs): """:rtype: list(Tree)""" return list(self.parse(sent, *args, **kwargs)) def parse_one(self, sent, *args, **kwargs): """:rtype: Tree or None""" return next(self.parse(sent, *args, **kwargs), None) nltk-3.1/nltk/parse/bllip.py0000644000076500000240000002516712607224144015627 0ustar sbstaff00000000000000# Natural Language Toolkit: Interface to BLLIP Parser # # Author: David McClosky # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT from __future__ import print_function from nltk.parse.api import ParserI from nltk.tree import Tree """ Interface for parsing with BLLIP Parser. Requires the Python bllipparser module. BllipParser objects can be constructed with the ``BllipParser.from_unified_model_dir`` class method or manually using the ``BllipParser`` constructor. The former is generally easier if you have a BLLIP Parser unified model directory -- a basic model can be obtained from NLTK's downloader. More unified parsing models can be obtained with BLLIP Parser's ModelFetcher (run ``python -m bllipparser.ModelFetcher`` or see docs for ``bllipparser.ModelFetcher.download_and_install_model``). Basic usage:: # download and install a basic unified parsing model (Wall Street Journal) # sudo python -m nltk.downloader bllip_wsj_no_aux >>> from nltk.data import find >>> model_dir = find('models/bllip_wsj_no_aux').path >>> bllip = BllipParser.from_unified_model_dir(model_dir) # 1-best parsing >>> sentence1 = 'British left waffles on Falklands .'.split() >>> top_parse = bllip.parse_one(sentence1) >>> print(top_parse) (S1 (S (NP (JJ British) (NN left)) (VP (VBZ waffles) (PP (IN on) (NP (NNP Falklands)))) (. .))) # n-best parsing >>> sentence2 = 'Time flies'.split() >>> all_parses = bllip.parse_all(sentence2) >>> print(len(all_parses)) 50 >>> print(all_parses[0]) (S1 (S (NP (NNP Time)) (VP (VBZ flies)))) # incorporating external tagging constraints (None means unconstrained tag) >>> constrained1 = bllip.tagged_parse([('Time', 'VB'), ('flies', 'NNS')]) >>> print(next(constrained1)) (S1 (NP (VB Time) (NNS flies))) >>> constrained2 = bllip.tagged_parse([('Time', 'NN'), ('flies', None)]) >>> print(next(constrained2)) (S1 (NP (NN Time) (VBZ flies))) References ---------- - Charniak, Eugene. "A maximum-entropy-inspired parser." Proceedings of the 1st North American chapter of the Association for Computational Linguistics conference. Association for Computational Linguistics, 2000. - Charniak, Eugene, and Mark Johnson. "Coarse-to-fine n-best parsing and MaxEnt discriminative reranking." Proceedings of the 43rd Annual Meeting on Association for Computational Linguistics. Association for Computational Linguistics, 2005. Known issues ------------ Note that BLLIP Parser is not currently threadsafe. Since this module uses a SWIG interface, it is potentially unsafe to create multiple ``BllipParser`` objects in the same process. BLLIP Parser currently has issues with non-ASCII text and will raise an error if given any. See http://pypi.python.org/pypi/bllipparser/ for more information on BLLIP Parser's Python interface. 
""" __all__ = ['BllipParser'] # this block allows this module to be imported even if bllipparser isn't # available try: from bllipparser import RerankingParser from bllipparser.RerankingParser import get_unified_model_parameters def _ensure_bllip_import_or_error(): pass except ImportError as ie: def _ensure_bllip_import_or_error(ie=ie): raise ImportError("Couldn't import bllipparser module: %s" % ie) def _ensure_ascii(words): try: for i, word in enumerate(words): word.decode('ascii') except UnicodeDecodeError: raise ValueError("Token %d (%r) is non-ASCII. BLLIP Parser " "currently doesn't support non-ASCII inputs." % (i, word)) def _scored_parse_to_nltk_tree(scored_parse): return Tree.fromstring(str(scored_parse.ptb_parse)) class BllipParser(ParserI): """ Interface for parsing with BLLIP Parser. BllipParser objects can be constructed with the ``BllipParser.from_unified_model_dir`` class method or manually using the ``BllipParser`` constructor. """ def __init__(self, parser_model=None, reranker_features=None, reranker_weights=None, parser_options=None, reranker_options=None): """ Load a BLLIP Parser model from scratch. You'll typically want to use the ``from_unified_model_dir()`` class method to construct this object. :param parser_model: Path to parser model directory :type parser_model: str :param reranker_features: Path the reranker model's features file :type reranker_features: str :param reranker_weights: Path the reranker model's weights file :type reranker_weights: str :param parser_options: optional dictionary of parser options, see ``bllipparser.RerankingParser.RerankingParser.load_parser_options()`` for more information. :type parser_options: dict(str) :param reranker_options: optional dictionary of reranker options, see ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()`` for more information. :type reranker_options: dict(str) """ _ensure_bllip_import_or_error() parser_options = parser_options or {} reranker_options = reranker_options or {} self.rrp = RerankingParser() self.rrp.load_parser_model(parser_model, **parser_options) if reranker_features and reranker_weights: self.rrp.load_reranker_model(features_filename=reranker_features, weights_filename=reranker_weights, **reranker_options) def parse(self, sentence): """ Use BLLIP Parser to parse a sentence. Takes a sentence as a list of words; it will be automatically tagged with this BLLIP Parser instance's tagger. :return: An iterator that generates parse trees for the sentence from most likely to least likely. :param sentence: The sentence to be parsed :type sentence: list(str) :rtype: iter(Tree) """ _ensure_ascii(sentence) nbest_list = self.rrp.parse(sentence) for scored_parse in nbest_list: yield _scored_parse_to_nltk_tree(scored_parse) def tagged_parse(self, word_and_tag_pairs): """ Use BLLIP to parse a sentence. Takes a sentence as a list of (word, tag) tuples; the sentence must have already been tokenized and tagged. BLLIP will attempt to use the tags provided but may use others if it can't come up with a complete parse subject to those constraints. You may also specify a tag as ``None`` to leave a token's tag unconstrained. :return: An iterator that generates parse trees for the sentence from most likely to least likely. 
:param sentence: Input sentence to parse as (word, tag) pairs :type sentence: list(tuple(str, str)) :rtype: iter(Tree) """ words = [] tag_map = {} for i, (word, tag) in enumerate(word_and_tag_pairs): words.append(word) if tag is not None: tag_map[i] = tag _ensure_ascii(words) nbest_list = self.rrp.parse_tagged(words, tag_map) for scored_parse in nbest_list: yield _scored_parse_to_nltk_tree(scored_parse) @classmethod def from_unified_model_dir(this_class, model_dir, parser_options=None, reranker_options=None): """ Create a ``BllipParser`` object from a unified parsing model directory. Unified parsing model directories are a standardized way of storing BLLIP parser and reranker models together on disk. See ``bllipparser.RerankingParser.get_unified_model_parameters()`` for more information about unified model directories. :return: A ``BllipParser`` object using the parser and reranker models in the model directory. :param model_dir: Path to the unified model directory. :type model_dir: str :param parser_options: optional dictionary of parser options, see ``bllipparser.RerankingParser.RerankingParser.load_parser_options()`` for more information. :type parser_options: dict(str) :param reranker_options: optional dictionary of reranker options, see ``bllipparser.RerankingParser.RerankingParser.load_reranker_model()`` for more information. :type reranker_options: dict(str) :rtype: BllipParser """ (parser_model_dir, reranker_features_filename, reranker_weights_filename) = get_unified_model_parameters(model_dir) return this_class(parser_model_dir, reranker_features_filename, reranker_weights_filename, parser_options, reranker_options) def demo(): """This assumes the Python module bllipparser is installed.""" # download and install a basic unified parsing model (Wall Street Journal) # sudo python -m nltk.downloader bllip_wsj_no_aux from nltk.data import find model_dir = find('models/bllip_wsj_no_aux').path print('Loading BLLIP Parsing models...') # the easiest way to get started is to use a unified model bllip = BllipParser.from_unified_model_dir(model_dir) print('Done.') sentence1 = 'British left waffles on Falklands .'.split() sentence2 = 'I saw the man with the telescope .'.split() # this sentence is known to fail under the WSJ parsing model fail1 = '# ! ? : -'.split() for sentence in (sentence1, sentence2, fail1): print('Sentence: %r' % ' '.join(sentence)) try: tree = next(bllip.parse(sentence)) print(tree) except StopIteration: print("(parse failed)") # n-best parsing demo for i, parse in enumerate(bllip.parse(sentence1)): print('parse %d:\n%s' % (i, parse)) # using external POS tag constraints print("forcing 'tree' to be 'NN':", next(bllip.tagged_parse([('A', None), ('tree', 'NN')]))) print("forcing 'A' to be 'DT' and 'tree' to be 'NNP':", next(bllip.tagged_parse([('A', 'DT'), ('tree', 'NNP')]))) # constraints don't have to make sense... 
(though on more complicated # sentences, they may cause the parse to fail) print("forcing 'A' to be 'NNP':", next(bllip.tagged_parse([('A', 'NNP'), ('tree', None)]))) def setup_module(module): from nose import SkipTest try: _ensure_bllip_import_or_error() except ImportError: raise SkipTest('doctests from nltk.parse.bllip are skipped because ' 'the bllipparser module is not installed') nltk-3.1/nltk/parse/chart.py0000644000076500000240000017027212607224144015624 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: A Chart Parser # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # Jean Mark Gawron # Peter Ljunglöf # URL: # For license information, see LICENSE.TXT """ Data classes and parser implementations for "chart parsers", which use dynamic programming to efficiently parse a text. A chart parser derives parse trees for a text by iteratively adding "edges" to a "chart." Each edge represents a hypothesis about the tree structure for a subsequence of the text. The chart is a "blackboard" for composing and combining these hypotheses. When a chart parser begins parsing a text, it creates a new (empty) chart, spanning the text. It then incrementally adds new edges to the chart. A set of "chart rules" specifies the conditions under which new edges should be added to the chart. Once the chart reaches a stage where none of the chart rules adds any new edges, parsing is complete. Charts are encoded with the ``Chart`` class, and edges are encoded with the ``TreeEdge`` and ``LeafEdge`` classes. The chart parser module defines three chart parsers: - ``ChartParser`` is a simple and flexible chart parser. Given a set of chart rules, it will apply those rules to the chart until no more edges are added. - ``SteppingChartParser`` is a subclass of ``ChartParser`` that can be used to step through the parsing process. """ from __future__ import print_function, division, unicode_literals import itertools import re import warnings from nltk import compat from nltk.tree import Tree from nltk.grammar import PCFG, is_nonterminal, is_terminal from nltk.util import OrderedDict from nltk.internals import raise_unorderable_types from nltk.compat import (total_ordering, python_2_unicode_compatible, unicode_repr) from nltk.parse.api import ParserI ######################################################################## ## Edges ######################################################################## @total_ordering class EdgeI(object): """ A hypothesis about the structure of part of a sentence. Each edge records the fact that a structure is (partially) consistent with the sentence. An edge contains: - A span, indicating what part of the sentence is consistent with the hypothesized structure. - A left-hand side, specifying what kind of structure is hypothesized. - A right-hand side, specifying the contents of the hypothesized structure. - A dot position, indicating how much of the hypothesized structure is consistent with the sentence. Every edge is either complete or incomplete: - An edge is complete if its structure is fully consistent with the sentence. - An edge is incomplete if its structure is partially consistent with the sentence. For every incomplete edge, the span specifies a possible prefix for the edge's structure. There are two kinds of edge: - A ``TreeEdge`` records which trees have been found to be (partially) consistent with the text. - A ``LeafEdge`` records the tokens occurring in the text. 
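    For instance (an illustrative sketch only; the span, symbols, and token
    below are invented), the two edge types defined later in this module are
    constructed directly as follows::

        from nltk.grammar import Nonterminal

        # an incomplete tree edge hypothesizing NP -> Det * Noun over tokens 0..1
        tree_edge = TreeEdge(span=(0, 1), lhs=Nonterminal('NP'),
                             rhs=[Nonterminal('Det'), Nonterminal('Noun')], dot=1)

        # a leaf edge recording the token 'the' at position 0
        leaf_edge = LeafEdge('the', 0)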
The ``EdgeI`` interface provides a common interface to both types of edge, allowing chart parsers to treat them in a uniform manner. """ def __init__(self): if self.__class__ == EdgeI: raise TypeError('Edge is an abstract interface') #//////////////////////////////////////////////////////////// # Span #//////////////////////////////////////////////////////////// def span(self): """ Return a tuple ``(s, e)``, where ``tokens[s:e]`` is the portion of the sentence that is consistent with this edge's structure. :rtype: tuple(int, int) """ raise NotImplementedError() def start(self): """ Return the start index of this edge's span. :rtype: int """ raise NotImplementedError() def end(self): """ Return the end index of this edge's span. :rtype: int """ raise NotImplementedError() def length(self): """ Return the length of this edge's span. :rtype: int """ raise NotImplementedError() #//////////////////////////////////////////////////////////// # Left Hand Side #//////////////////////////////////////////////////////////// def lhs(self): """ Return this edge's left-hand side, which specifies what kind of structure is hypothesized by this edge. :see: ``TreeEdge`` and ``LeafEdge`` for a description of the left-hand side values for each edge type. """ raise NotImplementedError() #//////////////////////////////////////////////////////////// # Right Hand Side #//////////////////////////////////////////////////////////// def rhs(self): """ Return this edge's right-hand side, which specifies the content of the structure hypothesized by this edge. :see: ``TreeEdge`` and ``LeafEdge`` for a description of the right-hand side values for each edge type. """ raise NotImplementedError() def dot(self): """ Return this edge's dot position, which indicates how much of the hypothesized structure is consistent with the sentence. In particular, ``self.rhs[:dot]`` is consistent with ``tokens[self.start():self.end()]``. :rtype: int """ raise NotImplementedError() def nextsym(self): """ Return the element of this edge's right-hand side that immediately follows its dot. :rtype: Nonterminal or terminal or None """ raise NotImplementedError() def is_complete(self): """ Return True if this edge's structure is fully consistent with the text. :rtype: bool """ raise NotImplementedError() def is_incomplete(self): """ Return True if this edge's structure is partially consistent with the text. :rtype: bool """ raise NotImplementedError() #//////////////////////////////////////////////////////////// # Comparisons & hashing #//////////////////////////////////////////////////////////// def __eq__(self, other): return (self.__class__ is other.__class__ and self._comparison_key == other._comparison_key) def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, EdgeI): raise_unorderable_types("<", self, other) if self.__class__ is other.__class__: return self._comparison_key < other._comparison_key else: return self.__class__.__name__ < other.__class__.__name__ def __hash__(self): try: return self._hash except AttributeError: self._hash = hash(self._comparison_key) return self._hash @python_2_unicode_compatible class TreeEdge(EdgeI): """ An edge that records the fact that a tree is (partially) consistent with the sentence. A tree edge consists of: - A span, indicating what part of the sentence is consistent with the hypothesized tree. - A left-hand side, specifying the hypothesized tree's node value. - A right-hand side, specifying the hypothesized tree's children. 
Each element of the right-hand side is either a terminal, specifying a token with that terminal as its leaf value; or a nonterminal, specifying a subtree with that nonterminal's symbol as its node value. - A dot position, indicating which children are consistent with part of the sentence. In particular, if ``dot`` is the dot position, ``rhs`` is the right-hand size, ``(start,end)`` is the span, and ``sentence`` is the list of tokens in the sentence, then ``tokens[start:end]`` can be spanned by the children specified by ``rhs[:dot]``. For more information about edges, see the ``EdgeI`` interface. """ def __init__(self, span, lhs, rhs, dot=0): """ Construct a new ``TreeEdge``. :type span: tuple(int, int) :param span: A tuple ``(s, e)``, where ``tokens[s:e]`` is the portion of the sentence that is consistent with the new edge's structure. :type lhs: Nonterminal :param lhs: The new edge's left-hand side, specifying the hypothesized tree's node value. :type rhs: list(Nonterminal and str) :param rhs: The new edge's right-hand side, specifying the hypothesized tree's children. :type dot: int :param dot: The position of the new edge's dot. This position specifies what prefix of the production's right hand side is consistent with the text. In particular, if ``sentence`` is the list of tokens in the sentence, then ``okens[span[0]:span[1]]`` can be spanned by the children specified by ``rhs[:dot]``. """ self._span = span self._lhs = lhs rhs = tuple(rhs) self._rhs = rhs self._dot = dot self._comparison_key = (span, lhs, rhs, dot) @staticmethod def from_production(production, index): """ Return a new ``TreeEdge`` formed from the given production. The new edge's left-hand side and right-hand side will be taken from ``production``; its span will be ``(index,index)``; and its dot position will be ``0``. :rtype: TreeEdge """ return TreeEdge(span=(index, index), lhs=production.lhs(), rhs=production.rhs(), dot=0) def move_dot_forward(self, new_end): """ Return a new ``TreeEdge`` formed from this edge. The new edge's dot position is increased by ``1``, and its end index will be replaced by ``new_end``. :param new_end: The new end index. :type new_end: int :rtype: TreeEdge """ return TreeEdge(span=(self._span[0], new_end), lhs=self._lhs, rhs=self._rhs, dot=self._dot+1) # Accessors def lhs(self): return self._lhs def span(self): return self._span def start(self): return self._span[0] def end(self): return self._span[1] def length(self): return self._span[1] - self._span[0] def rhs(self): return self._rhs def dot(self): return self._dot def is_complete(self): return self._dot == len(self._rhs) def is_incomplete(self): return self._dot != len(self._rhs) def nextsym(self): if self._dot >= len(self._rhs): return None else: return self._rhs[self._dot] # String representation def __str__(self): str = '[%s:%s] ' % (self._span[0], self._span[1]) str += '%-2r ->' % (self._lhs,) for i in range(len(self._rhs)): if i == self._dot: str += ' *' str += ' %s' % unicode_repr(self._rhs[i]) if len(self._rhs) == self._dot: str += ' *' return str def __repr__(self): return '[Edge: %s]' % self @python_2_unicode_compatible class LeafEdge(EdgeI): """ An edge that records the fact that a leaf value is consistent with a word in the sentence. A leaf edge consists of: - An index, indicating the position of the word. - A leaf, specifying the word's content. A leaf edge's left-hand side is its leaf value, and its right hand side is ``()``. Its span is ``[index, index+1]``, and its dot position is ``0``. 
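    A brief sketch (the token and position are chosen only for illustration)::

        edge = LeafEdge('dog', 3)
        edge.span()         # (3, 4)
        edge.lhs()          # 'dog'
        edge.is_complete()  # True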
""" def __init__(self, leaf, index): """ Construct a new ``LeafEdge``. :param leaf: The new edge's leaf value, specifying the word that is recorded by this edge. :param index: The new edge's index, specifying the position of the word that is recorded by this edge. """ self._leaf = leaf self._index = index self._comparison_key = (leaf, index) # Accessors def lhs(self): return self._leaf def span(self): return (self._index, self._index+1) def start(self): return self._index def end(self): return self._index+1 def length(self): return 1 def rhs(self): return () def dot(self): return 0 def is_complete(self): return True def is_incomplete(self): return False def nextsym(self): return None # String representations def __str__(self): return '[%s:%s] %s' % (self._index, self._index+1, unicode_repr(self._leaf)) def __repr__(self): return '[Edge: %s]' % (self) ######################################################################## ## Chart ######################################################################## class Chart(object): """ A blackboard for hypotheses about the syntactic constituents of a sentence. A chart contains a set of edges, and each edge encodes a single hypothesis about the structure of some portion of the sentence. The ``select`` method can be used to select a specific collection of edges. For example ``chart.select(is_complete=True, start=0)`` yields all complete edges whose start indices are 0. To ensure the efficiency of these selection operations, ``Chart`` dynamically creates and maintains an index for each set of attributes that have been selected on. In order to reconstruct the trees that are represented by an edge, the chart associates each edge with a set of child pointer lists. A child pointer list is a list of the edges that license an edge's right-hand side. :ivar _tokens: The sentence that the chart covers. :ivar _num_leaves: The number of tokens. :ivar _edges: A list of the edges in the chart :ivar _edge_to_cpls: A dictionary mapping each edge to a set of child pointer lists that are associated with that edge. :ivar _indexes: A dictionary mapping tuples of edge attributes to indices, where each index maps the corresponding edge attribute values to lists of edges. """ def __init__(self, tokens): """ Construct a new chart. The chart is initialized with the leaf edges corresponding to the terminal leaves. :type tokens: list :param tokens: The sentence that this chart will be used to parse. """ # Record the sentence token and the sentence length. self._tokens = tuple(tokens) self._num_leaves = len(self._tokens) # Initialise the chart. self.initialize() def initialize(self): """ Clear the chart. """ # A list of edges contained in this chart. self._edges = [] # The set of child pointer lists associated with each edge. self._edge_to_cpls = {} # Indexes mapping attribute values to lists of edges # (used by select()). self._indexes = {} #//////////////////////////////////////////////////////////// # Sentence Access #//////////////////////////////////////////////////////////// def num_leaves(self): """ Return the number of words in this chart's sentence. :rtype: int """ return self._num_leaves def leaf(self, index): """ Return the leaf value of the word at the given index. :rtype: str """ return self._tokens[index] def leaves(self): """ Return a list of the leaf values of each word in the chart's sentence. 
:rtype: list(str) """ return self._tokens #//////////////////////////////////////////////////////////// # Edge access #//////////////////////////////////////////////////////////// def edges(self): """ Return a list of all edges in this chart. New edges that are added to the chart after the call to edges() will *not* be contained in this list. :rtype: list(EdgeI) :see: ``iteredges``, ``select`` """ return self._edges[:] def iteredges(self): """ Return an iterator over the edges in this chart. It is not guaranteed that new edges which are added to the chart before the iterator is exhausted will also be generated. :rtype: iter(EdgeI) :see: ``edges``, ``select`` """ return iter(self._edges) # Iterating over the chart yields its edges. __iter__ = iteredges def num_edges(self): """ Return the number of edges contained in this chart. :rtype: int """ return len(self._edge_to_cpls) def select(self, **restrictions): """ Return an iterator over the edges in this chart. Any new edges that are added to the chart before the iterator is exahusted will also be generated. ``restrictions`` can be used to restrict the set of edges that will be generated. :param span: Only generate edges ``e`` where ``e.span()==span`` :param start: Only generate edges ``e`` where ``e.start()==start`` :param end: Only generate edges ``e`` where ``e.end()==end`` :param length: Only generate edges ``e`` where ``e.length()==length`` :param lhs: Only generate edges ``e`` where ``e.lhs()==lhs`` :param rhs: Only generate edges ``e`` where ``e.rhs()==rhs`` :param nextsym: Only generate edges ``e`` where ``e.nextsym()==nextsym`` :param dot: Only generate edges ``e`` where ``e.dot()==dot`` :param is_complete: Only generate edges ``e`` where ``e.is_complete()==is_complete`` :param is_incomplete: Only generate edges ``e`` where ``e.is_incomplete()==is_incomplete`` :rtype: iter(EdgeI) """ # If there are no restrictions, then return all edges. if restrictions=={}: return iter(self._edges) # Find the index corresponding to the given restrictions. restr_keys = sorted(restrictions.keys()) restr_keys = tuple(restr_keys) # If it doesn't exist, then create it. if restr_keys not in self._indexes: self._add_index(restr_keys) vals = tuple(restrictions[key] for key in restr_keys) return iter(self._indexes[restr_keys].get(vals, [])) def _add_index(self, restr_keys): """ A helper function for ``select``, which creates a new index for a given set of attributes (aka restriction keys). """ # Make sure it's a valid index. for key in restr_keys: if not hasattr(EdgeI, key): raise ValueError('Bad restriction: %s' % key) # Create the index. index = self._indexes[restr_keys] = {} # Add all existing edges to the index. for edge in self._edges: vals = tuple(getattr(edge, key)() for key in restr_keys) index.setdefault(vals, []).append(edge) def _register_with_indexes(self, edge): """ A helper function for ``insert``, which registers the new edge with all existing indexes. """ for (restr_keys, index) in self._indexes.items(): vals = tuple(getattr(edge, key)() for key in restr_keys) index.setdefault(vals, []).append(edge) #//////////////////////////////////////////////////////////// # Edge Insertion #//////////////////////////////////////////////////////////// def insert_with_backpointer(self, new_edge, previous_edge, child_edge): """ Add a new edge to the chart, using a pointer to the previous edge. 
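        For instance (a hedged sketch of how the fundamental rule below uses
        this method; the edge variables are illustrative): ``left_edge`` is
        incomplete and ends where the complete ``right_edge`` starts::

            new_edge = left_edge.move_dot_forward(right_edge.end())
            if chart.insert_with_backpointer(new_edge, left_edge, right_edge):
                print('chart was modified')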
""" cpls = self.child_pointer_lists(previous_edge) new_cpls = [cpl+(child_edge,) for cpl in cpls] return self.insert(new_edge, *new_cpls) def insert(self, edge, *child_pointer_lists): """ Add a new edge to the chart, and return True if this operation modified the chart. In particular, return true iff the chart did not already contain ``edge``, or if it did not already associate ``child_pointer_lists`` with ``edge``. :type edge: EdgeI :param edge: The new edge :type child_pointer_lists: sequence of tuple(EdgeI) :param child_pointer_lists: A sequence of lists of the edges that were used to form this edge. This list is used to reconstruct the trees (or partial trees) that are associated with ``edge``. :rtype: bool """ # Is it a new edge? if edge not in self._edge_to_cpls: # Add it to the list of edges. self._append_edge(edge) # Register with indexes. self._register_with_indexes(edge) # Get the set of child pointer lists for this edge. cpls = self._edge_to_cpls.setdefault(edge, OrderedDict()) chart_was_modified = False for child_pointer_list in child_pointer_lists: child_pointer_list = tuple(child_pointer_list) if child_pointer_list not in cpls: # It's a new CPL; register it, and return true. cpls[child_pointer_list] = True chart_was_modified = True return chart_was_modified def _append_edge(self, edge): self._edges.append(edge) #//////////////////////////////////////////////////////////// # Tree extraction & child pointer lists #//////////////////////////////////////////////////////////// def parses(self, root, tree_class=Tree): """ Return an iterator of the complete tree structures that span the entire chart, and whose root node is ``root``. """ for edge in self.select(start=0, end=self._num_leaves, lhs=root): for tree in self.trees(edge, tree_class=tree_class, complete=True): yield tree def trees(self, edge, tree_class=Tree, complete=False): """ Return an iterator of the tree structures that are associated with ``edge``. If ``edge`` is incomplete, then the unexpanded children will be encoded as childless subtrees, whose node value is the corresponding terminal or nonterminal. :rtype: list(Tree) :note: If two trees share a common subtree, then the same Tree may be used to encode that subtree in both trees. If you need to eliminate this subtree sharing, then create a deep copy of each tree. """ return iter(self._trees(edge, complete, memo={}, tree_class=tree_class)) def _trees(self, edge, complete, memo, tree_class): """ A helper function for ``trees``. :param memo: A dictionary used to record the trees that we've generated for each edge, so that when we see an edge more than once, we can reuse the same trees. """ # If we've seen this edge before, then reuse our old answer. if edge in memo: return memo[edge] # when we're reading trees off the chart, don't use incomplete edges if complete and edge.is_incomplete(): return [] # Leaf edges. if isinstance(edge, LeafEdge): leaf = self._tokens[edge.start()] memo[edge] = [leaf] return [leaf] # Until we're done computing the trees for edge, set # memo[edge] to be empty. This has the effect of filtering # out any cyclic trees (i.e., trees that contain themselves as # descendants), because if we reach this edge via a cycle, # then it will appear that the edge doesn't generate any trees. memo[edge] = [] trees = [] lhs = edge.lhs().symbol() # Each child pointer list can be used to form trees. for cpl in self.child_pointer_lists(edge): # Get the set of child choices for each child pointer. 
# child_choices[i] is the set of choices for the tree's # ith child. child_choices = [self._trees(cp, complete, memo, tree_class) for cp in cpl] # For each combination of children, add a tree. for children in itertools.product(*child_choices): trees.append(tree_class(lhs, children)) # If the edge is incomplete, then extend it with "partial trees": if edge.is_incomplete(): unexpanded = [tree_class(elt,[]) for elt in edge.rhs()[edge.dot():]] for tree in trees: tree.extend(unexpanded) # Update the memoization dictionary. memo[edge] = trees # Return the list of trees. return trees def child_pointer_lists(self, edge): """ Return the set of child pointer lists for the given edge. Each child pointer list is a list of edges that have been used to form this edge. :rtype: list(list(EdgeI)) """ # Make a copy, in case they modify it. return self._edge_to_cpls.get(edge, {}).keys() #//////////////////////////////////////////////////////////// # Display #//////////////////////////////////////////////////////////// def pretty_format_edge(self, edge, width=None): """ Return a pretty-printed string representation of a given edge in this chart. :rtype: str :param width: The number of characters allotted to each index in the sentence. """ if width is None: width = 50 // (self.num_leaves()+1) (start, end) = (edge.start(), edge.end()) str = '|' + ('.'+' '*(width-1))*start # Zero-width edges are "#" if complete, ">" if incomplete if start == end: if edge.is_complete(): str += '#' else: str += '>' # Spanning complete edges are "[===]"; Other edges are # "[---]" if complete, "[--->" if incomplete elif edge.is_complete() and edge.span() == (0,self._num_leaves): str += '['+('='*width)*(end-start-1) + '='*(width-1)+']' elif edge.is_complete(): str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+']' else: str += '['+('-'*width)*(end-start-1) + '-'*(width-1)+'>' str += (' '*(width-1)+'.')*(self._num_leaves-end) return str + '| %s' % edge def pretty_format_leaves(self, width=None): """ Return a pretty-printed string representation of this chart's leaves. This string can be used as a header for calls to ``pretty_format_edge``. """ if width is None: width = 50 // (self.num_leaves()+1) if self._tokens is not None and width>1: header = '|.' for tok in self._tokens: header += tok[:width-1].center(width-1)+'.' header += '|' else: header = '' return header def pretty_format(self, width=None): """ Return a pretty-printed string representation of this chart. :param width: The number of characters allotted to each index in the sentence. :rtype: str """ if width is None: width = 50 // (self.num_leaves()+1) # sort edges: primary key=length, secondary key=start index. 
# (and filter out the token edges) edges = sorted([(e.length(), e.start(), e) for e in self]) edges = [e for (_,_,e) in edges] return (self.pretty_format_leaves(width) + '\n' + '\n'.join(self.pretty_format_edge(edge, width) for edge in edges)) #//////////////////////////////////////////////////////////// # Display: Dot (AT&T Graphviz) #//////////////////////////////////////////////////////////// def dot_digraph(self): # Header s = 'digraph nltk_chart {\n' #s += ' size="5,5";\n' s += ' rankdir=LR;\n' s += ' node [height=0.1,width=0.1];\n' s += ' node [style=filled, color="lightgray"];\n' # Set up the nodes for y in range(self.num_edges(), -1, -1): if y == 0: s += ' node [style=filled, color="black"];\n' for x in range(self.num_leaves()+1): if y == 0 or (x <= self._edges[y-1].start() or x >= self._edges[y-1].end()): s += ' %04d.%04d [label=""];\n' % (x,y) # Add a spacer s += ' x [style=invis]; x->0000.0000 [style=invis];\n' # Declare ranks. for x in range(self.num_leaves()+1): s += ' {rank=same;' for y in range(self.num_edges()+1): if y == 0 or (x <= self._edges[y-1].start() or x >= self._edges[y-1].end()): s += ' %04d.%04d' % (x,y) s += '}\n' # Add the leaves s += ' edge [style=invis, weight=100];\n' s += ' node [shape=plaintext]\n' s += ' 0000.0000' for x in range(self.num_leaves()): s += '->%s->%04d.0000' % (self.leaf(x), x+1) s += ';\n\n' # Add the edges s += ' edge [style=solid, weight=1];\n' for y, edge in enumerate(self): for x in range(edge.start()): s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' % (x, y+1, x+1, y+1)) s += (' %04d.%04d -> %04d.%04d [label="%s"];\n' % (edge.start(), y+1, edge.end(), y+1, edge)) for x in range(edge.end(), self.num_leaves()): s += (' %04d.%04d -> %04d.%04d [style="invis"];\n' % (x, y+1, x+1, y+1)) s += '}\n' return s ######################################################################## ## Chart Rules ######################################################################## class ChartRuleI(object): """ A rule that specifies what new edges are licensed by any given set of existing edges. Each chart rule expects a fixed number of edges, as indicated by the class variable ``NUM_EDGES``. In particular: - A chart rule with ``NUM_EDGES=0`` specifies what new edges are licensed, regardless of existing edges. - A chart rule with ``NUM_EDGES=1`` specifies what new edges are licensed by a single existing edge. - A chart rule with ``NUM_EDGES=2`` specifies what new edges are licensed by a pair of existing edges. :type NUM_EDGES: int :cvar NUM_EDGES: The number of existing edges that this rule uses to license new edges. Typically, this number ranges from zero to two. """ def apply(self, chart, grammar, *edges): """ Return a generator that will add edges licensed by this rule and the given edges to the chart, one at a time. Each time the generator is resumed, it will either add a new edge and yield that edge; or return. :type edges: list(EdgeI) :param edges: A set of existing edges. The number of edges that should be passed to ``apply()`` is specified by the ``NUM_EDGES`` class variable. :rtype: iter(EdgeI) """ raise NotImplementedError() def apply_everywhere(self, chart, grammar): """ Return a generator that will add all edges licensed by this rule, given the edges that are currently in the chart, one at a time. Each time the generator is resumed, it will either add a new edge and yield that edge; or return. 
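        A hedged usage sketch (``chart`` and ``grammar`` are assumed to exist
        already; ``LeafInitRule`` is defined later in this module)::

            rule = LeafInitRule()
            for new_edge in rule.apply_everywhere(chart, grammar):
                print(new_edge)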
:rtype: iter(EdgeI) """ raise NotImplementedError() @python_2_unicode_compatible class AbstractChartRule(ChartRuleI): """ An abstract base class for chart rules. ``AbstractChartRule`` provides: - A default implementation for ``apply``. - A default implementation for ``apply_everywhere``, (Currently, this implementation assumes that ``NUM_EDGES``<=3.) - A default implementation for ``__str__``, which returns a name based on the rule's class name. """ # Subclasses must define apply. def apply(self, chart, grammar, *edges): raise NotImplementedError() # Default: loop through the given number of edges, and call # self.apply() for each set of edges. def apply_everywhere(self, chart, grammar): if self.NUM_EDGES == 0: for new_edge in self.apply(chart, grammar): yield new_edge elif self.NUM_EDGES == 1: for e1 in chart: for new_edge in self.apply(chart, grammar, e1): yield new_edge elif self.NUM_EDGES == 2: for e1 in chart: for e2 in chart: for new_edge in self.apply(chart, grammar, e1, e2): yield new_edge elif self.NUM_EDGES == 3: for e1 in chart: for e2 in chart: for e3 in chart: for new_edge in self.apply(chart,grammar,e1,e2,e3): yield new_edge else: raise AssertionError('NUM_EDGES>3 is not currently supported') # Default: return a name based on the class name. def __str__(self): # Add spaces between InitialCapsWords. return re.sub('([a-z])([A-Z])', r'\1 \2', self.__class__.__name__) #//////////////////////////////////////////////////////////// # Fundamental Rule #//////////////////////////////////////////////////////////// class FundamentalRule(AbstractChartRule): """ A rule that joins two adjacent edges to form a single combined edge. In particular, this rule specifies that any pair of edges - ``[A -> alpha \* B beta][i:j]`` - ``[B -> gamma \*][j:k]`` licenses the edge: - ``[A -> alpha B * beta][i:j]`` """ NUM_EDGES = 2 def apply(self, chart, grammar, left_edge, right_edge): # Make sure the rule is applicable. if not (left_edge.is_incomplete() and right_edge.is_complete() and left_edge.end() == right_edge.start() and left_edge.nextsym() == right_edge.lhs()): return # Construct the new edge. new_edge = left_edge.move_dot_forward(right_edge.end()) # Insert it into the chart. if chart.insert_with_backpointer(new_edge, left_edge, right_edge): yield new_edge class SingleEdgeFundamentalRule(FundamentalRule): """ A rule that joins a given edge with adjacent edges in the chart, to form combined edges. In particular, this rule specifies that either of the edges: - ``[A -> alpha \* B beta][i:j]`` - ``[B -> gamma \*][j:k]`` licenses the edge: - ``[A -> alpha B * beta][i:j]`` if the other edge is already in the chart. :note: This is basically ``FundamentalRule``, with one edge left unspecified. 
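    For example (illustrative only; the bracketed edges are invented and
    assumed to be in the chart)::

        # chart already contains the complete edge  [Noun -> 'dog' *][1:2]
        # left_edge is the incomplete edge          [NP -> Det * Noun][0:1]
        rule = SingleEdgeFundamentalRule()
        for new_edge in rule.apply(chart, grammar, left_edge):
            print(new_edge)   # e.g. [NP -> Det Noun *][0:2]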
""" NUM_EDGES = 1 def apply(self, chart, grammar, edge): if edge.is_incomplete(): for new_edge in self._apply_incomplete(chart, grammar, edge): yield new_edge else: for new_edge in self._apply_complete(chart, grammar, edge): yield new_edge def _apply_complete(self, chart, grammar, right_edge): for left_edge in chart.select(end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()): new_edge = left_edge.move_dot_forward(right_edge.end()) if chart.insert_with_backpointer(new_edge, left_edge, right_edge): yield new_edge def _apply_incomplete(self, chart, grammar, left_edge): for right_edge in chart.select(start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()): new_edge = left_edge.move_dot_forward(right_edge.end()) if chart.insert_with_backpointer(new_edge, left_edge, right_edge): yield new_edge #//////////////////////////////////////////////////////////// # Inserting Terminal Leafs #//////////////////////////////////////////////////////////// class LeafInitRule(AbstractChartRule): NUM_EDGES=0 def apply(self, chart, grammar): for index in range(chart.num_leaves()): new_edge = LeafEdge(chart.leaf(index), index) if chart.insert(new_edge, ()): yield new_edge #//////////////////////////////////////////////////////////// # Top-Down Prediction #//////////////////////////////////////////////////////////// class TopDownInitRule(AbstractChartRule): """ A rule licensing edges corresponding to the grammar productions for the grammar's start symbol. In particular, this rule specifies that ``[S -> \* alpha][0:i]`` is licensed for each grammar production ``S -> alpha``, where ``S`` is the grammar's start symbol. """ NUM_EDGES = 0 def apply(self, chart, grammar): for prod in grammar.productions(lhs=grammar.start()): new_edge = TreeEdge.from_production(prod, 0) if chart.insert(new_edge, ()): yield new_edge class TopDownPredictRule(AbstractChartRule): """ A rule licensing edges corresponding to the grammar productions for the nonterminal following an incomplete edge's dot. In particular, this rule specifies that ``[A -> alpha \* B beta][i:j]`` licenses the edge ``[B -> \* gamma][j:j]`` for each grammar production ``B -> gamma``. :note: This rule corresponds to the Predictor Rule in Earley parsing. """ NUM_EDGES = 1 def apply(self, chart, grammar, edge): if edge.is_complete(): return for prod in grammar.productions(lhs=edge.nextsym()): new_edge = TreeEdge.from_production(prod, edge.end()) if chart.insert(new_edge, ()): yield new_edge class CachedTopDownPredictRule(TopDownPredictRule): """ A cached version of ``TopDownPredictRule``. After the first time this rule is applied to an edge with a given ``end`` and ``next``, it will not generate any more edges for edges with that ``end`` and ``next``. If ``chart`` or ``grammar`` are changed, then the cache is flushed. """ def __init__(self): TopDownPredictRule.__init__(self) self._done = {} def apply(self, chart, grammar, edge): if edge.is_complete(): return nextsym, index = edge.nextsym(), edge.end() if not is_nonterminal(nextsym): return # If we've already applied this rule to an edge with the same # next & end, and the chart & grammar have not changed, then # just return (no new edges to add). done = self._done.get((nextsym, index), (None,None)) if done[0] is chart and done[1] is grammar: return # Add all the edges indicated by the top down expand rule. for prod in grammar.productions(lhs=nextsym): # If the left corner in the predicted production is # leaf, it must match with the input. 
if prod.rhs(): first = prod.rhs()[0] if is_terminal(first): if index >= chart.num_leaves() or first != chart.leaf(index): continue new_edge = TreeEdge.from_production(prod, index) if chart.insert(new_edge, ()): yield new_edge # Record the fact that we've applied this rule. self._done[nextsym, index] = (chart, grammar) #//////////////////////////////////////////////////////////// # Bottom-Up Prediction #//////////////////////////////////////////////////////////// class BottomUpPredictRule(AbstractChartRule): """ A rule licensing any edge corresponding to a production whose right-hand side begins with a complete edge's left-hand side. In particular, this rule specifies that ``[A -> alpha \*]`` licenses the edge ``[B -> \* A beta]`` for each grammar production ``B -> A beta``. """ NUM_EDGES = 1 def apply(self, chart, grammar, edge): if edge.is_incomplete(): return for prod in grammar.productions(rhs=edge.lhs()): new_edge = TreeEdge.from_production(prod, edge.start()) if chart.insert(new_edge, ()): yield new_edge class BottomUpPredictCombineRule(BottomUpPredictRule): """ A rule licensing any edge corresponding to a production whose right-hand side begins with a complete edge's left-hand side. In particular, this rule specifies that ``[A -> alpha \*]`` licenses the edge ``[B -> A \* beta]`` for each grammar production ``B -> A beta``. :note: This is like ``BottomUpPredictRule``, but it also applies the ``FundamentalRule`` to the resulting edge. """ NUM_EDGES = 1 def apply(self, chart, grammar, edge): if edge.is_incomplete(): return for prod in grammar.productions(rhs=edge.lhs()): new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1) if chart.insert(new_edge, (edge,)): yield new_edge class EmptyPredictRule(AbstractChartRule): """ A rule that inserts all empty productions as passive edges, in every position in the chart. 
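    For instance (a sketch; assumes ``grammar`` contains at least one empty
    production and ``chart`` already exists)::

        rule = EmptyPredictRule()
        for new_edge in rule.apply(chart, grammar):
            print(new_edge)   # zero-width edges, one per chart position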
""" NUM_EDGES = 0 def apply(self, chart, grammar): for prod in grammar.productions(empty=True): for index in compat.xrange(chart.num_leaves() + 1): new_edge = TreeEdge.from_production(prod, index) if chart.insert(new_edge, ()): yield new_edge ######################################################################## ## Filtered Bottom Up ######################################################################## class FilteredSingleEdgeFundamentalRule(SingleEdgeFundamentalRule): def _apply_complete(self, chart, grammar, right_edge): end = right_edge.end() nexttoken = end < chart.num_leaves() and chart.leaf(end) for left_edge in chart.select(end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()): if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()): new_edge = left_edge.move_dot_forward(right_edge.end()) if chart.insert_with_backpointer(new_edge, left_edge, right_edge): yield new_edge def _apply_incomplete(self, chart, grammar, left_edge): for right_edge in chart.select(start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()): end = right_edge.end() nexttoken = end < chart.num_leaves() and chart.leaf(end) if _bottomup_filter(grammar, nexttoken, left_edge.rhs(), left_edge.dot()): new_edge = left_edge.move_dot_forward(right_edge.end()) if chart.insert_with_backpointer(new_edge, left_edge, right_edge): yield new_edge class FilteredBottomUpPredictCombineRule(BottomUpPredictCombineRule): def apply(self, chart, grammar, edge): if edge.is_incomplete(): return end = edge.end() nexttoken = end < chart.num_leaves() and chart.leaf(end) for prod in grammar.productions(rhs=edge.lhs()): if _bottomup_filter(grammar, nexttoken, prod.rhs()): new_edge = TreeEdge(edge.span(), prod.lhs(), prod.rhs(), 1) if chart.insert(new_edge, (edge,)): yield new_edge def _bottomup_filter(grammar, nexttoken, rhs, dot=0): if len(rhs) <= dot + 1: return True _next = rhs[dot + 1] if is_terminal(_next): return nexttoken == _next else: return grammar.is_leftcorner(_next, nexttoken) ######################################################################## ## Generic Chart Parser ######################################################################## TD_STRATEGY = [LeafInitRule(), TopDownInitRule(), CachedTopDownPredictRule(), SingleEdgeFundamentalRule()] BU_STRATEGY = [LeafInitRule(), EmptyPredictRule(), BottomUpPredictRule(), SingleEdgeFundamentalRule()] BU_LC_STRATEGY = [LeafInitRule(), EmptyPredictRule(), BottomUpPredictCombineRule(), SingleEdgeFundamentalRule()] LC_STRATEGY = [LeafInitRule(), FilteredBottomUpPredictCombineRule(), FilteredSingleEdgeFundamentalRule()] class ChartParser(ParserI): """ A generic chart parser. A "strategy", or list of ``ChartRuleI`` instances, is used to decide what edges to add to the chart. In particular, ``ChartParser`` uses the following algorithm to parse texts: | Until no new edges are added: | For each *rule* in *strategy*: | Apply *rule* to any applicable edges in the chart. | Return any complete parses in the chart """ def __init__(self, grammar, strategy=BU_LC_STRATEGY, trace=0, trace_chart_width=50, use_agenda=True, chart_class=Chart): """ Create a new chart parser, that uses ``grammar`` to parse texts. :type grammar: CFG :param grammar: The grammar used to parse texts. :type strategy: list(ChartRuleI) :param strategy: A list of rules that should be used to decide what edges to add to the chart (top-down strategy by default). :type trace: int :param trace: The level of tracing that should be used when parsing a text. 
``0`` will generate no tracing output; and higher numbers will produce more verbose tracing output. :type trace_chart_width: int :param trace_chart_width: The default total width reserved for the chart in trace output. The remainder of each line will be used to display edges. :type use_agenda: bool :param use_agenda: Use an optimized agenda-based algorithm, if possible. :param chart_class: The class that should be used to create the parse charts. """ self._grammar = grammar self._strategy = strategy self._trace = trace self._trace_chart_width = trace_chart_width # If the strategy only consists of axioms (NUM_EDGES==0) and # inference rules (NUM_EDGES==1), we can use an agenda-based algorithm: self._use_agenda = use_agenda self._chart_class = chart_class self._axioms = [] self._inference_rules = [] for rule in strategy: if rule.NUM_EDGES == 0: self._axioms.append(rule) elif rule.NUM_EDGES == 1: self._inference_rules.append(rule) else: self._use_agenda = False def grammar(self): return self._grammar def _trace_new_edges(self, chart, rule, new_edges, trace, edge_width): if not trace: return print_rule_header = trace > 1 for edge in new_edges: if print_rule_header: print('%s:' % rule) print_rule_header = False print(chart.pretty_format_edge(edge, edge_width)) def chart_parse(self, tokens, trace=None): """ Return the final parse ``Chart`` from which all possible parse trees can be extracted. :param tokens: The sentence to be parsed :type tokens: list(str) :rtype: Chart """ if trace is None: trace = self._trace trace_new_edges = self._trace_new_edges tokens = list(tokens) self._grammar.check_coverage(tokens) chart = self._chart_class(tokens) grammar = self._grammar # Width, for printing trace edges. trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1) if trace: print(chart.pretty_format_leaves(trace_edge_width)) if self._use_agenda: # Use an agenda-based algorithm. for axiom in self._axioms: new_edges = list(axiom.apply(chart, grammar)) trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width) inference_rules = self._inference_rules agenda = chart.edges() # We reverse the initial agenda, since it is a stack # but chart.edges() functions as a queue. agenda.reverse() while agenda: edge = agenda.pop() for rule in inference_rules: new_edges = list(rule.apply(chart, grammar, edge)) if trace: trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) agenda += new_edges else: # Do not use an agenda-based algorithm. edges_added = True while edges_added: edges_added = False for rule in self._strategy: new_edges = list(rule.apply_everywhere(chart, grammar)) edges_added = len(new_edges) trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) # Return the final chart. return chart def parse(self, tokens, tree_class=Tree): chart = self.chart_parse(tokens) return iter(chart.parses(self._grammar.start(), tree_class=tree_class)) class TopDownChartParser(ChartParser): """ A ``ChartParser`` using a top-down parsing strategy. See ``ChartParser`` for more information. """ def __init__(self, grammar, **parser_args): ChartParser.__init__(self, grammar, TD_STRATEGY, **parser_args) class BottomUpChartParser(ChartParser): """ A ``ChartParser`` using a bottom-up parsing strategy. See ``ChartParser`` for more information. 
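    For example (a hedged sketch; ``demo_grammar()`` from the end of this
    module is used purely for illustration)::

        parser = BottomUpChartParser(demo_grammar())
        for tree in parser.parse('I saw John with a dog'.split()):
            print(tree)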
""" def __init__(self, grammar, **parser_args): if isinstance(grammar, PCFG): warnings.warn("BottomUpChartParser only works for CFG, " "use BottomUpProbabilisticChartParser instead", category=DeprecationWarning) ChartParser.__init__(self, grammar, BU_STRATEGY, **parser_args) class BottomUpLeftCornerChartParser(ChartParser): """ A ``ChartParser`` using a bottom-up left-corner parsing strategy. This strategy is often more efficient than standard bottom-up. See ``ChartParser`` for more information. """ def __init__(self, grammar, **parser_args): ChartParser.__init__(self, grammar, BU_LC_STRATEGY, **parser_args) class LeftCornerChartParser(ChartParser): def __init__(self, grammar, **parser_args): if not grammar.is_nonempty(): raise ValueError("LeftCornerParser only works for grammars " "without empty productions.") ChartParser.__init__(self, grammar, LC_STRATEGY, **parser_args) ######################################################################## ## Stepping Chart Parser ######################################################################## class SteppingChartParser(ChartParser): """ A ``ChartParser`` that allows you to step through the parsing process, adding a single edge at a time. It also allows you to change the parser's strategy or grammar midway through parsing a text. The ``initialize`` method is used to start parsing a text. ``step`` adds a single edge to the chart. ``set_strategy`` changes the strategy used by the chart parser. ``parses`` returns the set of parses that has been found by the chart parser. :ivar _restart: Records whether the parser's strategy, grammar, or chart has been changed. If so, then ``step`` must restart the parsing algorithm. """ def __init__(self, grammar, strategy=[], trace=0): self._chart = None self._current_chartrule = None self._restart = False ChartParser.__init__(self, grammar, strategy, trace) #//////////////////////////////////////////////////////////// # Initialization #//////////////////////////////////////////////////////////// def initialize(self, tokens): "Begin parsing the given tokens." self._chart = Chart(list(tokens)) self._restart = True #//////////////////////////////////////////////////////////// # Stepping #//////////////////////////////////////////////////////////// def step(self): """ Return a generator that adds edges to the chart, one at a time. Each time the generator is resumed, it adds a single edge and yields that edge. If no more edges can be added, then it yields None. If the parser's strategy, grammar, or chart is changed, then the generator will continue adding edges using the new strategy, grammar, or chart. Note that this generator never terminates, since the grammar or strategy might be changed to values that would add new edges. Instead, it yields None when no more edges can be added with the current strategy and grammar. """ if self._chart is None: raise ValueError('Parser must be initialized first') while True: self._restart = False w = 50 // (self._chart.num_leaves()+1) for e in self._parse(): if self._trace > 1: print(self._current_chartrule) if self._trace > 0: print(self._chart.pretty_format_edge(e,w)) yield e if self._restart: break else: yield None # No more edges. def _parse(self): """ A generator that implements the actual parsing algorithm. ``step`` iterates through this generator, and restarts it whenever the parser's strategy, grammar, or chart is modified. 
""" chart = self._chart grammar = self._grammar edges_added = 1 while edges_added > 0: edges_added = 0 for rule in self._strategy: self._current_chartrule = rule for e in rule.apply_everywhere(chart, grammar): edges_added += 1 yield e #//////////////////////////////////////////////////////////// # Accessors #//////////////////////////////////////////////////////////// def strategy(self): "Return the strategy used by this parser." return self._strategy def grammar(self): "Return the grammar used by this parser." return self._grammar def chart(self): "Return the chart that is used by this parser." return self._chart def current_chartrule(self): "Return the chart rule used to generate the most recent edge." return self._current_chartrule def parses(self, tree_class=Tree): "Return the parse trees currently contained in the chart." return self._chart.parses(self._grammar.start(), tree_class) #//////////////////////////////////////////////////////////// # Parser modification #//////////////////////////////////////////////////////////// def set_strategy(self, strategy): """ Change the strategy that the parser uses to decide which edges to add to the chart. :type strategy: list(ChartRuleI) :param strategy: A list of rules that should be used to decide what edges to add to the chart. """ if strategy == self._strategy: return self._strategy = strategy[:] # Make a copy. self._restart = True def set_grammar(self, grammar): "Change the grammar used by the parser." if grammar is self._grammar: return self._grammar = grammar self._restart = True def set_chart(self, chart): "Load a given chart into the chart parser." if chart is self._chart: return self._chart = chart self._restart = True #//////////////////////////////////////////////////////////// # Standard parser methods #//////////////////////////////////////////////////////////// def parse(self, tokens, tree_class=Tree): tokens = list(tokens) self._grammar.check_coverage(tokens) # Initialize ourselves. self.initialize(tokens) # Step until no more edges are generated. for e in self.step(): if e is None: break # Return an iterator of complete parses. return self.parses(tree_class=tree_class) ######################################################################## ## Demo Code ######################################################################## def demo_grammar(): from nltk.grammar import CFG return CFG.fromstring(""" S -> NP VP PP -> "with" NP NP -> NP PP VP -> VP PP VP -> Verb NP VP -> Verb NP -> Det Noun NP -> "John" NP -> "I" Det -> "the" Det -> "my" Det -> "a" Noun -> "dog" Noun -> "cookie" Verb -> "ate" Verb -> "saw" Prep -> "with" Prep -> "under" """) def demo(choice=None, print_times=True, print_grammar=False, print_trees=True, trace=2, sent='I saw John with a dog with my cookie', numparses=5): """ A demonstration of the chart parsers. """ import sys, time from nltk import nonterminals, Production, CFG # The grammar for ChartParser and SteppingChartParser: grammar = demo_grammar() if print_grammar: print("* Grammar") print(grammar) # Tokenize the sample sentence. print("* Sentence:") print(sent) tokens = sent.split() print(tokens) print() # Ask the user which parser to test, # if the parser wasn't provided as an argument if choice is None: print(' 1: Top-down chart parser') print(' 2: Bottom-up chart parser') print(' 3: Bottom-up left-corner chart parser') print(' 4: Left-corner chart parser with bottom-up filter') print(' 5: Stepping chart parser (alternating top-down & bottom-up)') print(' 6: All parsers') print('\nWhich parser (1-6)? 
', end=' ') choice = sys.stdin.readline().strip() print() choice = str(choice) if choice not in "123456": print('Bad parser number') return # Keep track of how long each parser takes. times = {} strategies = {'1': ('Top-down', TD_STRATEGY), '2': ('Bottom-up', BU_STRATEGY), '3': ('Bottom-up left-corner', BU_LC_STRATEGY), '4': ('Filtered left-corner', LC_STRATEGY)} choices = [] if choice in strategies: choices = [choice] if choice=='6': choices = "1234" # Run the requested chart parser(s), except the stepping parser. for strategy in choices: print("* Strategy: " + strategies[strategy][0]) print() cp = ChartParser(grammar, strategies[strategy][1], trace=trace) t = time.time() chart = cp.chart_parse(tokens) parses = list(chart.parses(grammar.start())) times[strategies[strategy][0]] = time.time()-t print("Nr edges in chart:", len(chart.edges())) if numparses: assert len(parses)==numparses, 'Not all parses found' if print_trees: for tree in parses: print(tree) else: print("Nr trees:", len(parses)) print() # Run the stepping parser, if requested. if choice in "56": print("* Strategy: Stepping (top-down vs bottom-up)") print() t = time.time() cp = SteppingChartParser(grammar, trace=trace) cp.initialize(tokens) for i in range(5): print('*** SWITCH TO TOP DOWN') cp.set_strategy(TD_STRATEGY) for j, e in enumerate(cp.step()): if j>20 or e is None: break print('*** SWITCH TO BOTTOM UP') cp.set_strategy(BU_STRATEGY) for j, e in enumerate(cp.step()): if j>20 or e is None: break times['Stepping'] = time.time()-t print("Nr edges in chart:", len(cp.chart().edges())) if numparses: assert len(list(cp.parses()))==numparses, 'Not all parses found' if print_trees: for tree in cp.parses(): print(tree) else: print("Nr trees:", len(list(cp.parses()))) print() # Print the times of all parsers: if not (print_times and times): return print("* Parsing times") print() maxlen = max(len(key) for key in times) format = '%' + repr(maxlen) + 's parser: %6.3fsec' times_items = times.items() for (parser, t) in sorted(times_items, key=lambda a:a[1]): print(format % (parser, t)) if __name__ == '__main__': demo() nltk-3.1/nltk/parse/dependencygraph.py0000755000076500000240000007443212607224144017667 0ustar sbstaff00000000000000# Natural Language Toolkit: Dependency Grammars # # Copyright (C) 2001-2015 NLTK Project # Author: Jason Narad # Steven Bird (modifications) # # URL: # For license information, see LICENSE.TXT # """ Tools for reading and writing dependency trees. The input is assumed to be in Malt-TAB format (http://stp.lingfil.uu.se/~nivre/research/MaltXML.html). """ from __future__ import print_function, unicode_literals from collections import defaultdict from itertools import chain from pprint import pformat import subprocess import warnings from nltk.tree import Tree from nltk.compat import python_2_unicode_compatible, string_types ################################################################# # DependencyGraph Class ################################################################# @python_2_unicode_compatible class DependencyGraph(object): """ A container for the nodes and labelled edges of a dependency structure. """ def __init__(self, tree_str=None, cell_extractor=None, zero_based=False, cell_separator=None, top_relation_label='ROOT'): """Dependency graph. We place a dummy `TOP` node with the index 0, since the root node is often assigned 0 as its head. This also means that the indexing of the nodes corresponds directly to the Malt-TAB format, which starts at 1. 
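For example (an illustrative three-word graph, written as
``\\n``-separated Malt-TAB lines), node 0 is the artificial `TOP` node
and ``self.root`` is the word whose head is 0::

    dg = DependencyGraph('John N 2\\nloves V 0\\nMary N 2')
    dg.nodes[0]['ctag']     # 'TOP'
    dg.root['word']         # 'loves'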
If zero-based is True, then Malt-TAB-like input with node numbers starting at 0 and the root node assigned -1 (as produced by, e.g., zpar). :param str cell_separator: the cell separator. If not provided, cells are split by whitespace. :param str top_relation_label: the label by which the top relation is identified, for examlple, `ROOT`, `null` or `TOP`. """ self.nodes = defaultdict(lambda: {'address': None, 'word': None, 'lemma': None, 'ctag': None, 'tag': None, 'feats': None, 'head': None, 'deps': defaultdict(list), 'rel': None, }) self.nodes[0].update( { 'ctag': 'TOP', 'tag': 'TOP', 'address': 0, } ) self.root = None if tree_str: self._parse( tree_str, cell_extractor=cell_extractor, zero_based=zero_based, cell_separator=cell_separator, top_relation_label=top_relation_label, ) def remove_by_address(self, address): """ Removes the node with the given address. References to this node in others will still exist. """ del self.nodes[address] def redirect_arcs(self, originals, redirect): """ Redirects arcs to any of the nodes in the originals list to the redirect node address. """ for node in self.nodes.values(): new_deps = [] for dep in node['deps']: if dep in originals: new_deps.append(redirect) else: new_deps.append(dep) node['deps'] = new_deps def add_arc(self, head_address, mod_address): """ Adds an arc from the node specified by head_address to the node specified by the mod address. """ relation = self.nodes[mod_address]['rel'] self.nodes[head_address]['deps'].setdefault(relation, []) self.nodes[head_address]['deps'][relation].append(mod_address) #self.nodes[head_address]['deps'].append(mod_address) def connect_graph(self): """ Fully connects all non-root nodes. All nodes are set to be dependents of the root node. """ for node1 in self.nodes.values(): for node2 in self.nodes.values(): if node1['address'] != node2['address'] and node2['rel'] != 'TOP': relation = node2['rel'] node1['deps'].setdefault(relation, []) node1['deps'][relation].append(node2['address']) #node1['deps'].append(node2['address']) def get_by_address(self, node_address): """Return the node with the given address.""" return self.nodes[node_address] def contains_address(self, node_address): """ Returns true if the graph contains a node with the given node address, false otherwise. """ return node_address in self.nodes def to_dot(self): """Return a dot representation suitable for using with Graphviz. >>> dg = DependencyGraph( ... 'John N 2\\n' ... 'loves V 0\\n' ... 'Mary N 2' ... ) >>> print(dg.to_dot()) digraph G{ edge [dir=forward] node [shape=plaintext] 0 [label="0 (None)"] 0 -> 2 [label="ROOT"] 1 [label="1 (John)"] 2 [label="2 (loves)"] 2 -> 1 [label=""] 2 -> 3 [label=""] 3 [label="3 (Mary)"] } """ # Start the digraph specification s = 'digraph G{\n' s += 'edge [dir=forward]\n' s += 'node [shape=plaintext]\n' # Draw the remaining nodes for node in sorted(self.nodes.values(), key=lambda v: v['address']): s += '\n%s [label="%s (%s)"]' % (node['address'], node['address'], node['word']) for rel, deps in node['deps'].items(): for dep in deps: if rel is not None: s += '\n%s -> %s [label="%s"]' % (node['address'], dep, rel) else: s += '\n%s -> %s ' % (node['address'], dep) s += "\n}" return s def _repr_svg_(self): """Show SVG representation of the transducer (IPython magic). >>> dg = DependencyGraph( ... 'John N 2\\n' ... 'loves V 0\\n' ... 'Mary N 2' ... 
) >>> dg._repr_svg_().split('\\n')[0] '' """ dot_string = self.to_dot() try: process = subprocess.Popen( ['dot', '-Tsvg'], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True, ) except OSError: raise Exception('Cannot find the dot binary from Graphviz package') out, err = process.communicate(dot_string) if err: raise Exception( 'Cannot create svg representation by running dot from string: {}' ''.format(dot_string)) return out def __str__(self): return pformat(self.nodes) def __repr__(self): return "".format(len(self.nodes)) @staticmethod def load(filename, zero_based=False, cell_separator=None, top_relation_label='ROOT'): """ :param filename: a name of a file in Malt-TAB format :param zero_based: nodes in the input file are numbered starting from 0 rather than 1 (as produced by, e.g., zpar) :param str cell_separator: the cell separator. If not provided, cells are split by whitespace. :param str top_relation_label: the label by which the top relation is identified, for examlple, `ROOT`, `null` or `TOP`. :return: a list of DependencyGraphs """ with open(filename) as infile: return [ DependencyGraph( tree_str, zero_based=zero_based, cell_separator=cell_separator, top_relation_label=top_relation_label, ) for tree_str in infile.read().split('\n\n') ] def left_children(self, node_index): """ Returns the number of left children under the node specified by the given address. """ children = chain.from_iterable(self.nodes[node_index]['deps'].values()) index = self.nodes[node_index]['address'] return sum(1 for c in children if c < index) def right_children(self, node_index): """ Returns the number of right children under the node specified by the given address. """ children = chain.from_iterable(self.nodes[node_index]['deps'].values()) index = self.nodes[node_index]['address'] return sum(1 for c in children if c > index) def add_node(self, node): if not self.contains_address(node['address']): self.nodes[node['address']].update(node) def _parse(self, input_, cell_extractor=None, zero_based=False, cell_separator=None, top_relation_label='ROOT'): """Parse a sentence. :param extractor: a function that given a tuple of cells returns a 7-tuple, where the values are ``word, lemma, ctag, tag, feats, head, rel``. :param str cell_separator: the cell separator. If not provided, cells are split by whitespace. :param str top_relation_label: the label by which the top relation is identified, for examlple, `ROOT`, `null` or `TOP`. 
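A hypothetical ``cell_extractor`` for a 5-column tab-separated format
might look like this (an illustrative sketch only; the column layout and
the variable ``tree_str`` are invented for the example).  It follows the
one-argument, 7-tuple signature described above::

    def extract_5_cells(cells):
        # hypothetical layout: word, lemma, tag, head, rel
        word, lemma, tag, head, rel = cells
        # must return: word, lemma, ctag, tag, feats, head, rel
        return word, lemma, tag, tag, '', head, rel

    dg = DependencyGraph(tree_str, cell_extractor=extract_5_cells)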
""" def extract_3_cells(cells, index): word, tag, head = cells return index, word, word, tag, tag, '', head, '' def extract_4_cells(cells, index): word, tag, head, rel = cells return index, word, word, tag, tag, '', head, rel def extract_7_cells(cells, index): line_index, word, lemma, tag, _, head, rel = cells try: index = int(line_index) except ValueError: # index can't be parsed as an integer, use default pass return index, word, lemma, tag, tag, '', head, rel def extract_10_cells(cells, index): line_index, word, lemma, ctag, tag, feats, head, rel, _, _ = cells try: index = int(line_index) except ValueError: # index can't be parsed as an integer, use default pass return index, word, lemma, ctag, tag, feats, head, rel extractors = { 3: extract_3_cells, 4: extract_4_cells, 7: extract_7_cells, 10: extract_10_cells, } if isinstance(input_, string_types): input_ = (line for line in input_.split('\n')) lines = (l.rstrip() for l in input_) lines = (l for l in lines if l) cell_number = None for index, line in enumerate(lines, start=1): cells = line.split(cell_separator) if cell_number is None: cell_number = len(cells) else: assert cell_number == len(cells) if cell_extractor is None: try: cell_extractor = extractors[cell_number] except KeyError: raise ValueError( 'Number of tab-delimited fields ({0}) not supported by ' 'CoNLL(10) or Malt-Tab(4) format'.format(cell_number) ) try: index, word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells, index) except (TypeError, ValueError): # cell_extractor doesn't take 2 arguments or doesn't return 8 # values; assume the cell_extractor is an older external # extractor and doesn't accept or return an index. word, lemma, ctag, tag, feats, head, rel = cell_extractor(cells) if head == '_': continue head = int(head) if zero_based: head += 1 self.nodes[index].update( { 'address': index, 'word': word, 'lemma': lemma, 'ctag': ctag, 'tag': tag, 'feats': feats, 'head': head, 'rel': rel, } ) # Make sure that the fake root node has labeled dependencies. if (cell_number == 3) and (head == 0): rel = top_relation_label self.nodes[head]['deps'][rel].append(index) if self.nodes[0]['deps'][top_relation_label]: root_address = self.nodes[0]['deps'][top_relation_label][0] self.root = self.nodes[root_address] self.top_relation_label = top_relation_label else: warnings.warn( "The graph doesn't contain a node " "that depends on the root element." ) def _word(self, node, filter=True): w = node['word'] if filter: if w != ',': return w return w def _tree(self, i): """ Turn dependency graphs into NLTK trees. :param int i: index of a node :return: either a word (if the indexed node is a leaf) or a ``Tree``. """ node = self.get_by_address(i) word = node['word'] deps = sorted(chain.from_iterable(node['deps'].values())) if deps: return Tree(word, [self._tree(dep) for dep in deps]) else: return word def tree(self): """ Starting with the ``root`` node, build a dependency tree using the NLTK ``Tree`` constructor. Dependency labels are omitted. 
""" node = self.root word = node['word'] deps = sorted(chain.from_iterable(node['deps'].values())) return Tree(word, [self._tree(dep) for dep in deps]) def triples(self, node=None): """ Extract dependency triples of the form: ((head word, head tag), rel, (dep word, dep tag)) """ if not node: node = self.root head = (node['word'], node['ctag']) for i in sorted(chain.from_iterable(node['deps'].values())): dep = self.get_by_address(i) yield (head, dep['rel'], (dep['word'], dep['ctag'])) for triple in self.triples(node=dep): yield triple def _hd(self, i): try: return self.nodes[i]['head'] except IndexError: return None def _rel(self, i): try: return self.nodes[i]['rel'] except IndexError: return None # what's the return type? Boolean or list? def contains_cycle(self): """Check whether there are cycles. >>> dg = DependencyGraph(treebank_data) >>> dg.contains_cycle() False >>> cyclic_dg = DependencyGraph() >>> top = {'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0} >>> child1 = {'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1} >>> child2 = {'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2} >>> child3 = {'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3} >>> child4 = {'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4} >>> cyclic_dg.nodes = { ... 0: top, ... 1: child1, ... 2: child2, ... 3: child3, ... 4: child4, ... } >>> cyclic_dg.root = top >>> cyclic_dg.contains_cycle() [3, 1, 2, 4] """ distances = {} for node in self.nodes.values(): for dep in node['deps']: key = tuple([node['address'], dep]) distances[key] = 1 for _ in self.nodes: new_entries = {} for pair1 in distances: for pair2 in distances: if pair1[1] == pair2[0]: key = tuple([pair1[0], pair2[1]]) new_entries[key] = distances[pair1] + distances[pair2] for pair in new_entries: distances[pair] = new_entries[pair] if pair[0] == pair[1]: path = self.get_cycle_path(self.get_by_address(pair[0]), pair[0]) return path return False # return []? def get_cycle_path(self, curr_node, goal_node_index): for dep in curr_node['deps']: if dep == goal_node_index: return [curr_node['address']] for dep in curr_node['deps']: path = self.get_cycle_path(self.get_by_address(dep), goal_node_index) if len(path) > 0: path.insert(0, curr_node['address']) return path return [] def to_conll(self, style): """ The dependency graph in CoNLL format. 
:param style: the style to use for the format (3, 4, 10 columns) :type style: int :rtype: str """ if style == 3: template = '{word}\t{tag}\t{head}\n' elif style == 4: template = '{word}\t{tag}\t{head}\t{rel}\n' elif style == 10: template = '{i}\t{word}\t{lemma}\t{ctag}\t{tag}\t{feats}\t{head}\t{rel}\t_\t_\n' else: raise ValueError( 'Number of tab-delimited fields ({0}) not supported by ' 'CoNLL(10) or Malt-Tab(4) format'.format(style) ) return ''.join(template.format(i=i, **node) for i, node in sorted(self.nodes.items()) if node['tag'] != 'TOP') def nx_graph(self): """Convert the data in a ``nodelist`` into a networkx labeled directed graph.""" import networkx nx_nodelist = list(range(1, len(self.nodes))) nx_edgelist = [ (n, self._hd(n), self._rel(n)) for n in nx_nodelist if self._hd(n) ] self.nx_labels = {} for n in nx_nodelist: self.nx_labels[n] = self.nodes[n]['word'] g = networkx.MultiDiGraph() g.add_nodes_from(nx_nodelist) g.add_edges_from(nx_edgelist) return g class DependencyGraphError(Exception): """Dependency graph exception.""" def demo(): malt_demo() conll_demo() conll_file_demo() cycle_finding_demo() def malt_demo(nx=False): """ A demonstration of the result of reading a dependency version of the first sentence of the Penn Treebank. """ dg = DependencyGraph("""Pierre NNP 2 NMOD Vinken NNP 8 SUB , , 2 P 61 CD 5 NMOD years NNS 6 AMOD old JJ 2 NMOD , , 2 P will MD 0 ROOT join VB 8 VC the DT 11 NMOD board NN 9 OBJ as IN 9 VMOD a DT 15 NMOD nonexecutive JJ 15 NMOD director NN 12 PMOD Nov. NNP 9 VMOD 29 CD 16 NMOD . . 9 VMOD """) tree = dg.tree() tree.pprint() if nx: # currently doesn't work import networkx from matplotlib import pylab g = dg.nx_graph() g.info() pos = networkx.spring_layout(g, dim=1) networkx.draw_networkx_nodes(g, pos, node_size=50) # networkx.draw_networkx_edges(g, pos, edge_color='k', width=8) networkx.draw_networkx_labels(g, pos, dg.nx_labels) pylab.xticks([]) pylab.yticks([]) pylab.savefig('tree.png') pylab.show() def conll_demo(): """ A demonstration of how to read a string representation of a CoNLL format dependency tree. """ dg = DependencyGraph(conll_data1) tree = dg.tree() tree.pprint() print(dg) print(dg.to_conll(4)) def conll_file_demo(): print('Mass conll_read demo...') graphs = [DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry] for graph in graphs: tree = graph.tree() print('\n') tree.pprint() def cycle_finding_demo(): dg = DependencyGraph(treebank_data) print(dg.contains_cycle()) cyclic_dg = DependencyGraph() cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'TOP', 'address': 0}) cyclic_dg.add_node({'word': None, 'deps': [2], 'rel': 'NTOP', 'address': 1}) cyclic_dg.add_node({'word': None, 'deps': [4], 'rel': 'NTOP', 'address': 2}) cyclic_dg.add_node({'word': None, 'deps': [1], 'rel': 'NTOP', 'address': 3}) cyclic_dg.add_node({'word': None, 'deps': [3], 'rel': 'NTOP', 'address': 4}) print(cyclic_dg.contains_cycle()) treebank_data = """Pierre NNP 2 NMOD Vinken NNP 8 SUB , , 2 P 61 CD 5 NMOD years NNS 6 AMOD old JJ 2 NMOD , , 2 P will MD 0 ROOT join VB 8 VC the DT 11 NMOD board NN 9 OBJ as IN 9 VMOD a DT 15 NMOD nonexecutive JJ 15 NMOD director NN 12 PMOD Nov. NNP 9 VMOD 29 CD 16 NMOD . . 
9 VMOD """ conll_data1 = """ 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ 2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _ 3 met met Prep Prep voor 8 mod _ _ 4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _ 5 moeder moeder N N soort|ev|neut 3 obj1 _ _ 6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _ 7 gaan ga V V hulp|inf 6 vc _ _ 8 winkelen winkel V V intrans|inf 11 cnj _ _ 9 , , Punc Punc komma 8 punct _ _ 10 zwemmen zwem V V intrans|inf 11 cnj _ _ 11 of of Conj Conj neven 7 vc _ _ 12 terrassen terras N N soort|mv|neut 11 cnj _ _ 13 . . Punc Punc punt 12 punct _ _ """ conll_data2 = """1 Cathy Cathy N N eigen|ev|neut 2 su _ _ 2 zag zie V V trans|ovt|1of2of3|ev 0 ROOT _ _ 3 hen hen Pron Pron per|3|mv|datofacc 2 obj1 _ _ 4 wild wild Adj Adj attr|stell|onverv 5 mod _ _ 5 zwaaien zwaai N N soort|mv|neut 2 vc _ _ 6 . . Punc Punc punt 5 punct _ _ 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ 2 had heb V V trans|ovt|1of2of3|ev 0 ROOT _ _ 3 met met Prep Prep voor 8 mod _ _ 4 haar haar Pron Pron bez|3|ev|neut|attr 5 det _ _ 5 moeder moeder N N soort|ev|neut 3 obj1 _ _ 6 kunnen kan V V hulp|ott|1of2of3|mv 2 vc _ _ 7 gaan ga V V hulp|inf 6 vc _ _ 8 winkelen winkel V V intrans|inf 11 cnj _ _ 9 , , Punc Punc komma 8 punct _ _ 10 zwemmen zwem V V intrans|inf 11 cnj _ _ 11 of of Conj Conj neven 7 vc _ _ 12 terrassen terras N N soort|mv|neut 11 cnj _ _ 13 . . Punc Punc punt 12 punct _ _ 1 Dat dat Pron Pron aanw|neut|attr 2 det _ _ 2 werkwoord werkwoord N N soort|ev|neut 6 obj1 _ _ 3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _ 4 ze ze Pron Pron per|3|evofmv|nom 6 su _ _ 5 zelf zelf Pron Pron aanw|neut|attr|wzelf 3 predm _ _ 6 uitgevonden vind V V trans|verldw|onverv 3 vc _ _ 7 . . Punc Punc punt 6 punct _ _ 1 Het het Pron Pron onbep|neut|zelfst 2 su _ _ 2 hoorde hoor V V trans|ovt|1of2of3|ev 0 ROOT _ _ 3 bij bij Prep Prep voor 2 ld _ _ 4 de de Art Art bep|zijdofmv|neut 6 det _ _ 5 warme warm Adj Adj attr|stell|vervneut 6 mod _ _ 6 zomerdag zomerdag N N soort|ev|neut 3 obj1 _ _ 7 die die Pron Pron betr|neut|zelfst 6 mod _ _ 8 ze ze Pron Pron per|3|evofmv|nom 12 su _ _ 9 ginds ginds Adv Adv gew|aanw 12 mod _ _ 10 achter achter Adv Adv gew|geenfunc|stell|onverv 12 svp _ _ 11 had heb V V hulp|ovt|1of2of3|ev 7 body _ _ 12 gelaten laat V V trans|verldw|onverv 11 vc _ _ 13 . . Punc Punc punt 12 punct _ _ 1 Ze ze Pron Pron per|3|evofmv|nom 2 su _ _ 2 hadden heb V V trans|ovt|1of2of3|mv 0 ROOT _ _ 3 languit languit Adv Adv gew|geenfunc|stell|onverv 11 mod _ _ 4 naast naast Prep Prep voor 11 mod _ _ 5 elkaar elkaar Pron Pron rec|neut 4 obj1 _ _ 6 op op Prep Prep voor 11 ld _ _ 7 de de Art Art bep|zijdofmv|neut 8 det _ _ 8 strandstoelen strandstoel N N soort|mv|neut 6 obj1 _ _ 9 kunnen kan V V hulp|inf 2 vc _ _ 10 gaan ga V V hulp|inf 9 vc _ _ 11 liggen lig V V intrans|inf 10 vc _ _ 12 . . Punc Punc punt 11 punct _ _ 1 Zij zij Pron Pron per|3|evofmv|nom 2 su _ _ 2 zou zal V V hulp|ovt|1of2of3|ev 7 cnj _ _ 3 mams mams N N soort|ev|neut 4 det _ _ 4 rug rug N N soort|ev|neut 5 obj1 _ _ 5 ingewreven wrijf V V trans|verldw|onverv 6 vc _ _ 6 hebben heb V V hulp|inf 2 vc _ _ 7 en en Conj Conj neven 0 ROOT _ _ 8 mam mam V V trans|ovt|1of2of3|ev 7 cnj _ _ 9 de de Art Art bep|zijdofmv|neut 10 det _ _ 10 hare hare Pron Pron bez|3|ev|neut|attr 8 obj1 _ _ 11 . . 
Punc Punc punt 10 punct _ _ 1 Of of Conj Conj onder|metfin 0 ROOT _ _ 2 ze ze Pron Pron per|3|evofmv|nom 3 su _ _ 3 had heb V V hulp|ovt|1of2of3|ev 0 ROOT _ _ 4 gewoon gewoon Adj Adj adv|stell|onverv 10 mod _ _ 5 met met Prep Prep voor 10 mod _ _ 6 haar haar Pron Pron bez|3|ev|neut|attr 7 det _ _ 7 vriendinnen vriendin N N soort|mv|neut 5 obj1 _ _ 8 rond rond Adv Adv deelv 10 svp _ _ 9 kunnen kan V V hulp|inf 3 vc _ _ 10 slenteren slenter V V intrans|inf 9 vc _ _ 11 in in Prep Prep voor 10 mod _ _ 12 de de Art Art bep|zijdofmv|neut 13 det _ _ 13 buurt buurt N N soort|ev|neut 11 obj1 _ _ 14 van van Prep Prep voor 13 mod _ _ 15 Trafalgar_Square Trafalgar_Square MWU N_N eigen|ev|neut_eigen|ev|neut 14 obj1 _ _ 16 . . Punc Punc punt 15 punct _ _ """ if __name__ == '__main__': demo() nltk-3.1/nltk/parse/earleychart.py0000644000076500000240000004440212607224144017021 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: An Incremental Earley Chart Parser # # Copyright (C) 2001-2015 NLTK Project # Author: Peter Ljunglöf # Rob Speer # Edward Loper # Steven Bird # Jean Mark Gawron # URL: # For license information, see LICENSE.TXT """ Data classes and parser implementations for *incremental* chart parsers, which use dynamic programming to efficiently parse a text. A "chart parser" derives parse trees for a text by iteratively adding \"edges\" to a \"chart\". Each "edge" represents a hypothesis about the tree structure for a subsequence of the text. The "chart" is a \"blackboard\" for composing and combining these hypotheses. A parser is "incremental", if it guarantees that for all i, j where i < j, all edges ending at i are built before any edges ending at j. This is appealing for, say, speech recognizer hypothesis filtering. The main parser class is ``EarleyChartParser``, which is a top-down algorithm, originally formulated by Jay Earley (1970). """ from __future__ import print_function, division from nltk.compat import xrange from nltk.parse.chart import (Chart, ChartParser, EdgeI, LeafEdge, LeafInitRule, BottomUpPredictRule, BottomUpPredictCombineRule, TopDownInitRule, SingleEdgeFundamentalRule, EmptyPredictRule, CachedTopDownPredictRule, FilteredSingleEdgeFundamentalRule, FilteredBottomUpPredictCombineRule) from nltk.parse.featurechart import (FeatureChart, FeatureChartParser, FeatureTopDownInitRule, FeatureTopDownPredictRule, FeatureEmptyPredictRule, FeatureBottomUpPredictRule, FeatureBottomUpPredictCombineRule, FeatureSingleEdgeFundamentalRule) #//////////////////////////////////////////////////////////// # Incremental Chart #//////////////////////////////////////////////////////////// class IncrementalChart(Chart): def initialize(self): # A sequence of edge lists contained in this chart. self._edgelists = tuple([] for x in self._positions()) # The set of child pointer lists associated with each edge. self._edge_to_cpls = {} # Indexes mapping attribute values to lists of edges # (used by select()). self._indexes = {} def edges(self): return list(self.iteredges()) def iteredges(self): return (edge for edgelist in self._edgelists for edge in edgelist) def select(self, end, **restrictions): edgelist = self._edgelists[end] # If there are no restrictions, then return all edges. if restrictions=={}: return iter(edgelist) # Find the index corresponding to the given restrictions. restr_keys = sorted(restrictions.keys()) restr_keys = tuple(restr_keys) # If it doesn't exist, then create it. 
if restr_keys not in self._indexes: self._add_index(restr_keys) vals = tuple(restrictions[key] for key in restr_keys) return iter(self._indexes[restr_keys][end].get(vals, [])) def _add_index(self, restr_keys): # Make sure it's a valid index. for key in restr_keys: if not hasattr(EdgeI, key): raise ValueError('Bad restriction: %s' % key) # Create the index. index = self._indexes[restr_keys] = tuple({} for x in self._positions()) # Add all existing edges to the index. for end, edgelist in enumerate(self._edgelists): this_index = index[end] for edge in edgelist: vals = tuple(getattr(edge, key)() for key in restr_keys) this_index.setdefault(vals, []).append(edge) def _register_with_indexes(self, edge): end = edge.end() for (restr_keys, index) in self._indexes.items(): vals = tuple(getattr(edge, key)() for key in restr_keys) index[end].setdefault(vals, []).append(edge) def _append_edge(self, edge): self._edgelists[edge.end()].append(edge) def _positions(self): return xrange(self.num_leaves() + 1) class FeatureIncrementalChart(IncrementalChart, FeatureChart): def select(self, end, **restrictions): edgelist = self._edgelists[end] # If there are no restrictions, then return all edges. if restrictions=={}: return iter(edgelist) # Find the index corresponding to the given restrictions. restr_keys = sorted(restrictions.keys()) restr_keys = tuple(restr_keys) # If it doesn't exist, then create it. if restr_keys not in self._indexes: self._add_index(restr_keys) vals = tuple(self._get_type_if_possible(restrictions[key]) for key in restr_keys) return iter(self._indexes[restr_keys][end].get(vals, [])) def _add_index(self, restr_keys): # Make sure it's a valid index. for key in restr_keys: if not hasattr(EdgeI, key): raise ValueError('Bad restriction: %s' % key) # Create the index. index = self._indexes[restr_keys] = tuple({} for x in self._positions()) # Add all existing edges to the index. for end, edgelist in enumerate(self._edgelists): this_index = index[end] for edge in edgelist: vals = tuple(self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys) this_index.setdefault(vals, []).append(edge) def _register_with_indexes(self, edge): end = edge.end() for (restr_keys, index) in self._indexes.items(): vals = tuple(self._get_type_if_possible(getattr(edge, key)()) for key in restr_keys) index[end].setdefault(vals, []).append(edge) #//////////////////////////////////////////////////////////// # Incremental CFG Rules #//////////////////////////////////////////////////////////// class CompleteFundamentalRule(SingleEdgeFundamentalRule): def _apply_incomplete(self, chart, grammar, left_edge): end = left_edge.end() # When the chart is incremental, we only have to look for # empty complete edges here. 
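# (Complete edges that start at `end` but end at a later position do
# not exist yet at this point of the left-to-right pass; when such an
# edge is eventually added, _apply_complete will combine it with this
# incomplete edge instead.)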
for right_edge in chart.select(start=end, end=end, is_complete=True, lhs=left_edge.nextsym()): new_edge = left_edge.move_dot_forward(right_edge.end()) if chart.insert_with_backpointer(new_edge, left_edge, right_edge): yield new_edge class CompleterRule(CompleteFundamentalRule): _fundamental_rule = CompleteFundamentalRule() def apply(self, chart, grammar, edge): if not isinstance(edge, LeafEdge): for new_edge in self._fundamental_rule.apply(chart, grammar, edge): yield new_edge class ScannerRule(CompleteFundamentalRule): _fundamental_rule = CompleteFundamentalRule() def apply(self, chart, grammar, edge): if isinstance(edge, LeafEdge): for new_edge in self._fundamental_rule.apply(chart, grammar, edge): yield new_edge class PredictorRule(CachedTopDownPredictRule): pass class FilteredCompleteFundamentalRule(FilteredSingleEdgeFundamentalRule): def apply(self, chart, grammar, edge): # Since the Filtered rule only works for grammars without empty productions, # we only have to bother with complete edges here. if edge.is_complete(): for new_edge in self._apply_complete(chart, grammar, edge): yield new_edge #//////////////////////////////////////////////////////////// # Incremental FCFG Rules #//////////////////////////////////////////////////////////// class FeatureCompleteFundamentalRule(FeatureSingleEdgeFundamentalRule): def _apply_incomplete(self, chart, grammar, left_edge): fr = self._fundamental_rule end = left_edge.end() # When the chart is incremental, we only have to look for # empty complete edges here. for right_edge in chart.select(start=end, end=end, is_complete=True, lhs=left_edge.nextsym()): for new_edge in fr.apply(chart, grammar, left_edge, right_edge): yield new_edge class FeatureCompleterRule(CompleterRule): _fundamental_rule = FeatureCompleteFundamentalRule() class FeatureScannerRule(ScannerRule): _fundamental_rule = FeatureCompleteFundamentalRule() class FeaturePredictorRule(FeatureTopDownPredictRule): pass #//////////////////////////////////////////////////////////// # Incremental CFG Chart Parsers #//////////////////////////////////////////////////////////// EARLEY_STRATEGY = [LeafInitRule(), TopDownInitRule(), CompleterRule(), ScannerRule(), PredictorRule()] TD_INCREMENTAL_STRATEGY = [LeafInitRule(), TopDownInitRule(), CachedTopDownPredictRule(), CompleteFundamentalRule()] BU_INCREMENTAL_STRATEGY = [LeafInitRule(), EmptyPredictRule(), BottomUpPredictRule(), CompleteFundamentalRule()] BU_LC_INCREMENTAL_STRATEGY = [LeafInitRule(), EmptyPredictRule(), BottomUpPredictCombineRule(), CompleteFundamentalRule()] LC_INCREMENTAL_STRATEGY = [LeafInitRule(), FilteredBottomUpPredictCombineRule(), FilteredCompleteFundamentalRule()] class IncrementalChartParser(ChartParser): """ An *incremental* chart parser implementing Jay Earley's parsing algorithm: | For each index end in [0, 1, ..., N]: | For each edge such that edge.end = end: | If edge is incomplete and edge.next is not a part of speech: | Apply PredictorRule to edge | If edge is incomplete and edge.next is a part of speech: | Apply ScannerRule to edge | If edge is complete: | Apply CompleterRule to edge | Return any complete parses in the chart """ def __init__(self, grammar, strategy=BU_LC_INCREMENTAL_STRATEGY, trace=0, trace_chart_width=50, chart_class=IncrementalChart): """ Create a new Earley chart parser, that uses ``grammar`` to parse texts. :type grammar: CFG :param grammar: The grammar used to parse texts. :type trace: int :param trace: The level of tracing that should be used when parsing a text. 
``0`` will generate no tracing output; and higher numbers will produce more verbose tracing output. :type trace_chart_width: int :param trace_chart_width: The default total width reserved for the chart in trace output. The remainder of each line will be used to display edges. :param chart_class: The class that should be used to create the charts used by this parser. """ self._grammar = grammar self._trace = trace self._trace_chart_width = trace_chart_width self._chart_class = chart_class self._axioms = [] self._inference_rules = [] for rule in strategy: if rule.NUM_EDGES == 0: self._axioms.append(rule) elif rule.NUM_EDGES == 1: self._inference_rules.append(rule) else: raise ValueError("Incremental inference rules must have " "NUM_EDGES == 0 or 1") def chart_parse(self, tokens, trace=None): if trace is None: trace = self._trace trace_new_edges = self._trace_new_edges tokens = list(tokens) self._grammar.check_coverage(tokens) chart = self._chart_class(tokens) grammar = self._grammar # Width, for printing trace edges. trace_edge_width = self._trace_chart_width // (chart.num_leaves() + 1) if trace: print(chart.pretty_format_leaves(trace_edge_width)) for axiom in self._axioms: new_edges = list(axiom.apply(chart, grammar)) trace_new_edges(chart, axiom, new_edges, trace, trace_edge_width) inference_rules = self._inference_rules for end in range(chart.num_leaves()+1): if trace > 1: print("\n* Processing queue:", end, "\n") agenda = list(chart.select(end=end)) while agenda: edge = agenda.pop() for rule in inference_rules: new_edges = list(rule.apply(chart, grammar, edge)) trace_new_edges(chart, rule, new_edges, trace, trace_edge_width) for new_edge in new_edges: if new_edge.end()==end: agenda.append(new_edge) return chart class EarleyChartParser(IncrementalChartParser): def __init__(self, grammar, **parser_args): IncrementalChartParser.__init__(self, grammar, EARLEY_STRATEGY, **parser_args) pass class IncrementalTopDownChartParser(IncrementalChartParser): def __init__(self, grammar, **parser_args): IncrementalChartParser.__init__(self, grammar, TD_INCREMENTAL_STRATEGY, **parser_args) class IncrementalBottomUpChartParser(IncrementalChartParser): def __init__(self, grammar, **parser_args): IncrementalChartParser.__init__(self, grammar, BU_INCREMENTAL_STRATEGY, **parser_args) class IncrementalBottomUpLeftCornerChartParser(IncrementalChartParser): def __init__(self, grammar, **parser_args): IncrementalChartParser.__init__(self, grammar, BU_LC_INCREMENTAL_STRATEGY, **parser_args) class IncrementalLeftCornerChartParser(IncrementalChartParser): def __init__(self, grammar, **parser_args): if not grammar.is_nonempty(): raise ValueError("IncrementalLeftCornerParser only works for grammars " "without empty productions.") IncrementalChartParser.__init__(self, grammar, LC_INCREMENTAL_STRATEGY, **parser_args) #//////////////////////////////////////////////////////////// # Incremental FCFG Chart Parsers #//////////////////////////////////////////////////////////// EARLEY_FEATURE_STRATEGY = [LeafInitRule(), FeatureTopDownInitRule(), FeatureCompleterRule(), FeatureScannerRule(), FeaturePredictorRule()] TD_INCREMENTAL_FEATURE_STRATEGY = [LeafInitRule(), FeatureTopDownInitRule(), FeatureTopDownPredictRule(), FeatureCompleteFundamentalRule()] BU_INCREMENTAL_FEATURE_STRATEGY = [LeafInitRule(), FeatureEmptyPredictRule(), FeatureBottomUpPredictRule(), FeatureCompleteFundamentalRule()] BU_LC_INCREMENTAL_FEATURE_STRATEGY = [LeafInitRule(), FeatureEmptyPredictRule(), FeatureBottomUpPredictCombineRule(), 
FeatureCompleteFundamentalRule()] class FeatureIncrementalChartParser(IncrementalChartParser, FeatureChartParser): def __init__(self, grammar, strategy=BU_LC_INCREMENTAL_FEATURE_STRATEGY, trace_chart_width=20, chart_class=FeatureIncrementalChart, **parser_args): IncrementalChartParser.__init__(self, grammar, strategy=strategy, trace_chart_width=trace_chart_width, chart_class=chart_class, **parser_args) class FeatureEarleyChartParser(FeatureIncrementalChartParser): def __init__(self, grammar, **parser_args): FeatureIncrementalChartParser.__init__(self, grammar, EARLEY_FEATURE_STRATEGY, **parser_args) class FeatureIncrementalTopDownChartParser(FeatureIncrementalChartParser): def __init__(self, grammar, **parser_args): FeatureIncrementalChartParser.__init__(self, grammar, TD_INCREMENTAL_FEATURE_STRATEGY, **parser_args) class FeatureIncrementalBottomUpChartParser(FeatureIncrementalChartParser): def __init__(self, grammar, **parser_args): FeatureIncrementalChartParser.__init__(self, grammar, BU_INCREMENTAL_FEATURE_STRATEGY, **parser_args) class FeatureIncrementalBottomUpLeftCornerChartParser(FeatureIncrementalChartParser): def __init__(self, grammar, **parser_args): FeatureIncrementalChartParser.__init__(self, grammar, BU_LC_INCREMENTAL_FEATURE_STRATEGY, **parser_args) #//////////////////////////////////////////////////////////// # Demonstration #//////////////////////////////////////////////////////////// def demo(print_times=True, print_grammar=False, print_trees=True, trace=2, sent='I saw John with a dog with my cookie', numparses=5): """ A demonstration of the Earley parsers. """ import sys, time from nltk.parse.chart import demo_grammar # The grammar for ChartParser and SteppingChartParser: grammar = demo_grammar() if print_grammar: print("* Grammar") print(grammar) # Tokenize the sample sentence. print("* Sentence:") print(sent) tokens = sent.split() print(tokens) print() # Do the parsing. earley = EarleyChartParser(grammar, trace=trace) t = time.clock() chart = earley.chart_parse(tokens) parses = list(chart.parses(grammar.start())) t = time.clock()-t # Print results. if numparses: assert len(parses)==numparses, 'Not all parses found' if print_trees: for tree in parses: print(tree) else: print("Nr trees:", len(parses)) if print_times: print("Time:", t) if __name__ == '__main__': demo() nltk-3.1/nltk/parse/evaluate.py0000644000076500000240000001036312607224144016323 0ustar sbstaff00000000000000# Natural Language Toolkit: evaluation of dependency parser # # Author: Long Duong # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT from __future__ import division import unicodedata class DependencyEvaluator(object): """ Class for measuring labelled and unlabelled attachment score for dependency parsing. Note that the evaluation ignores punctuation. >>> from nltk.parse import DependencyGraph, DependencyEvaluator >>> gold_sent = DependencyGraph(\""" ... Pierre NNP 2 NMOD ... Vinken NNP 8 SUB ... , , 2 P ... 61 CD 5 NMOD ... years NNS 6 AMOD ... old JJ 2 NMOD ... , , 2 P ... will MD 0 ROOT ... join VB 8 VC ... the DT 11 NMOD ... board NN 9 OBJ ... as IN 9 VMOD ... a DT 15 NMOD ... nonexecutive JJ 15 NMOD ... director NN 12 PMOD ... Nov. NNP 9 VMOD ... 29 CD 16 NMOD ... . . 9 VMOD ... \""") >>> parsed_sent = DependencyGraph(\""" ... Pierre NNP 8 NMOD ... Vinken NNP 1 SUB ... , , 3 P ... 61 CD 6 NMOD ... years NNS 6 AMOD ... old JJ 2 NMOD ... , , 3 AMOD ... will MD 0 ROOT ... join VB 8 VC ... the DT 11 AMOD ... board NN 9 OBJECT ... as IN 9 NMOD ... 
a DT 15 NMOD ... nonexecutive JJ 15 NMOD ... director NN 12 PMOD ... Nov. NNP 9 VMOD ... 29 CD 16 NMOD ... . . 9 VMOD ... \""") >>> de = DependencyEvaluator([parsed_sent],[gold_sent]) >>> las, uas = de.eval() >>> las 0.8... >>> abs(uas - 0.6) < 0.00001 True """ def __init__(self, parsed_sents, gold_sents): """ :param parsed_sents: the list of parsed_sents as the output of parser :type parsed_sents: list(DependencyGraph) """ self._parsed_sents = parsed_sents self._gold_sents = gold_sents def _remove_punct(self, inStr): """ Function to remove punctuation from Unicode string. :param input: the input string :return: Unicode string after remove all punctuation """ punc_cat = set(["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"]) return "".join(x for x in inStr if unicodedata.category(x) not in punc_cat) def eval(self): """ Return the Labeled Attachment Score (LAS) and Unlabeled Attachment Score (UAS) :return : tuple(float,float) """ if (len(self._parsed_sents) != len(self._gold_sents)): raise ValueError(" Number of parsed sentence is different with number of gold sentence.") corr = 0 corrL = 0 total = 0 for i in range(len(self._parsed_sents)): parsed_sent_nodes = self._parsed_sents[i].nodes gold_sent_nodes = self._gold_sents[i].nodes if (len(parsed_sent_nodes) != len(gold_sent_nodes)): raise ValueError("Sentences must have equal length.") for parsed_node_address, parsed_node in parsed_sent_nodes.items(): gold_node = gold_sent_nodes[parsed_node_address] if parsed_node["word"] is None: continue if parsed_node["word"] != gold_node["word"]: raise ValueError("Sentence sequence is not matched.") # Ignore if word is punctuation by default # if (parsed_sent[j]["word"] in string.punctuation): if self._remove_punct(parsed_node["word"]) == "": continue total += 1 if parsed_node["head"] == gold_node["head"]: corr += 1 if parsed_node["rel"] == gold_node["rel"]: corrL += 1 return corr / total, corrL / total nltk-3.1/nltk/parse/featurechart.py0000644000076500000240000005323512607224144017177 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Chart Parser for Feature-Based Grammars # # Copyright (C) 2001-2015 NLTK Project # Author: Rob Speer # Peter Ljunglöf # URL: # For license information, see LICENSE.TXT """ Extension of chart parsing implementation to handle grammars with feature structures as nodes. """ from __future__ import print_function, unicode_literals from nltk.compat import xrange, python_2_unicode_compatible from nltk.featstruct import FeatStruct, unify, TYPE, find_variables from nltk.sem import logic from nltk.tree import Tree from nltk.grammar import (Nonterminal, Production, CFG, FeatStructNonterminal, is_nonterminal, is_terminal) from nltk.parse.chart import (TreeEdge, Chart, ChartParser, EdgeI, FundamentalRule, LeafInitRule, EmptyPredictRule, BottomUpPredictRule, SingleEdgeFundamentalRule, BottomUpPredictCombineRule, CachedTopDownPredictRule, TopDownInitRule) #//////////////////////////////////////////////////////////// # Tree Edge #//////////////////////////////////////////////////////////// @python_2_unicode_compatible class FeatureTreeEdge(TreeEdge): """ A specialized tree edge that allows shared variable bindings between nonterminals on the left-hand side and right-hand side. Each ``FeatureTreeEdge`` contains a set of ``bindings``, i.e., a dictionary mapping from variables to values. If the edge is not complete, then these bindings are simply stored. 
However, if the edge is complete, then the constructor applies these bindings to every nonterminal in the edge whose symbol implements the interface ``SubstituteBindingsI``. """ def __init__(self, span, lhs, rhs, dot=0, bindings=None): """ Construct a new edge. If the edge is incomplete (i.e., if ``dot alpha \* B1 beta][i:j]`` - ``[B2 -> gamma \*][j:k]`` licenses the edge: - ``[A -> alpha B3 \* beta][i:j]`` assuming that B1 and B2 can be unified to generate B3. """ def apply(self, chart, grammar, left_edge, right_edge): # Make sure the rule is applicable. if not (left_edge.end() == right_edge.start() and left_edge.is_incomplete() and right_edge.is_complete() and isinstance(left_edge, FeatureTreeEdge)): return found = right_edge.lhs() nextsym = left_edge.nextsym() if isinstance(right_edge, FeatureTreeEdge): if not is_nonterminal(nextsym): return if left_edge.nextsym()[TYPE] != right_edge.lhs()[TYPE]: return # Create a copy of the bindings. bindings = left_edge.bindings() # We rename vars here, because we don't want variables # from the two different productions to match. found = found.rename_variables(used_vars=left_edge.variables()) # Unify B1 (left_edge.nextsym) with B2 (right_edge.lhs) to # generate B3 (result). result = unify(nextsym, found, bindings, rename_vars=False) if result is None: return else: if nextsym != found: return # Create a copy of the bindings. bindings = left_edge.bindings() # Construct the new edge. new_edge = left_edge.move_dot_forward(right_edge.end(), bindings) # Add it to the chart, with appropriate child pointers. if chart.insert_with_backpointer(new_edge, left_edge, right_edge): yield new_edge class FeatureSingleEdgeFundamentalRule(SingleEdgeFundamentalRule): """ A specialized version of the completer / single edge fundamental rule that operates on nonterminals whose symbols are ``FeatStructNonterminal``s. Rather than simply comparing the nonterminals for equality, they are unified. """ _fundamental_rule = FeatureFundamentalRule() def _apply_complete(self, chart, grammar, right_edge): fr = self._fundamental_rule for left_edge in chart.select(end=right_edge.start(), is_complete=False, nextsym=right_edge.lhs()): for new_edge in fr.apply(chart, grammar, left_edge, right_edge): yield new_edge def _apply_incomplete(self, chart, grammar, left_edge): fr = self._fundamental_rule for right_edge in chart.select(start=left_edge.end(), is_complete=True, lhs=left_edge.nextsym()): for new_edge in fr.apply(chart, grammar, left_edge, right_edge): yield new_edge #//////////////////////////////////////////////////////////// # Top-Down Prediction #//////////////////////////////////////////////////////////// class FeatureTopDownInitRule(TopDownInitRule): def apply(self, chart, grammar): for prod in grammar.productions(lhs=grammar.start()): new_edge = FeatureTreeEdge.from_production(prod, 0) if chart.insert(new_edge, ()): yield new_edge class FeatureTopDownPredictRule(CachedTopDownPredictRule): """ A specialized version of the (cached) top down predict rule that operates on nonterminals whose symbols are ``FeatStructNonterminal``s. Rather than simply comparing the nonterminals for equality, they are unified. The top down expand rule states that: - ``[A -> alpha \* B1 beta][i:j]`` licenses the edge: - ``[B2 -> \* gamma][j:j]`` for each grammar production ``B2 -> gamma``, assuming that B1 and B2 can be unified. 
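For example (an illustrative sketch; the toy grammar below is invented
for the example), the top-down feature parser, whose strategy includes
this rule, predicts the production ``NP[num=pl] -> 'dogs'`` from an edge
expecting ``NP[num=?n]``, because the two nonterminals unify::

    from nltk.grammar import FeatureGrammar
    from nltk.parse.featurechart import FeatureTopDownChartParser

    grammar = FeatureGrammar.fromstring('''
        S -> NP[num=?n] VP[num=?n]
        NP[num=pl] -> 'dogs'
        VP[num=pl] -> 'bark'
    ''')
    parser = FeatureTopDownChartParser(grammar)
    trees = list(parser.parse('dogs bark'.split()))   # a single parse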
""" def apply(self, chart, grammar, edge): if edge.is_complete(): return nextsym, index = edge.nextsym(), edge.end() if not is_nonterminal(nextsym): return # If we've already applied this rule to an edge with the same # next & end, and the chart & grammar have not changed, then # just return (no new edges to add). nextsym_with_bindings = edge.next_with_bindings() done = self._done.get((nextsym_with_bindings, index), (None, None)) if done[0] is chart and done[1] is grammar: return for prod in grammar.productions(lhs=nextsym): # If the left corner in the predicted production is # leaf, it must match with the input. if prod.rhs(): first = prod.rhs()[0] if is_terminal(first): if index >= chart.num_leaves(): continue if first != chart.leaf(index): continue # We rename vars here, because we don't want variables # from the two different productions to match. if unify(prod.lhs(), nextsym_with_bindings, rename_vars=True): new_edge = FeatureTreeEdge.from_production(prod, edge.end()) if chart.insert(new_edge, ()): yield new_edge # Record the fact that we've applied this rule. self._done[nextsym_with_bindings, index] = (chart, grammar) #//////////////////////////////////////////////////////////// # Bottom-Up Prediction #//////////////////////////////////////////////////////////// class FeatureBottomUpPredictRule(BottomUpPredictRule): def apply(self, chart, grammar, edge): if edge.is_incomplete(): return for prod in grammar.productions(rhs=edge.lhs()): if isinstance(edge, FeatureTreeEdge): _next = prod.rhs()[0] if not is_nonterminal(_next): continue new_edge = FeatureTreeEdge.from_production(prod, edge.start()) if chart.insert(new_edge, ()): yield new_edge class FeatureBottomUpPredictCombineRule(BottomUpPredictCombineRule): def apply(self, chart, grammar, edge): if edge.is_incomplete(): return found = edge.lhs() for prod in grammar.productions(rhs=found): bindings = {} if isinstance(edge, FeatureTreeEdge): _next = prod.rhs()[0] if not is_nonterminal(_next): continue # We rename vars here, because we don't want variables # from the two different productions to match. 
used_vars = find_variables((prod.lhs(),) + prod.rhs(), fs_class=FeatStruct) found = found.rename_variables(used_vars=used_vars) result = unify(_next, found, bindings, rename_vars=False) if result is None: continue new_edge = (FeatureTreeEdge.from_production(prod, edge.start()) .move_dot_forward(edge.end(), bindings)) if chart.insert(new_edge, (edge,)): yield new_edge class FeatureEmptyPredictRule(EmptyPredictRule): def apply(self, chart, grammar): for prod in grammar.productions(empty=True): for index in xrange(chart.num_leaves() + 1): new_edge = FeatureTreeEdge.from_production(prod, index) if chart.insert(new_edge, ()): yield new_edge #//////////////////////////////////////////////////////////// # Feature Chart Parser #//////////////////////////////////////////////////////////// TD_FEATURE_STRATEGY = [LeafInitRule(), FeatureTopDownInitRule(), FeatureTopDownPredictRule(), FeatureSingleEdgeFundamentalRule()] BU_FEATURE_STRATEGY = [LeafInitRule(), FeatureEmptyPredictRule(), FeatureBottomUpPredictRule(), FeatureSingleEdgeFundamentalRule()] BU_LC_FEATURE_STRATEGY = [LeafInitRule(), FeatureEmptyPredictRule(), FeatureBottomUpPredictCombineRule(), FeatureSingleEdgeFundamentalRule()] class FeatureChartParser(ChartParser): def __init__(self, grammar, strategy=BU_LC_FEATURE_STRATEGY, trace_chart_width=20, chart_class=FeatureChart, **parser_args): ChartParser.__init__(self, grammar, strategy=strategy, trace_chart_width=trace_chart_width, chart_class=chart_class, **parser_args) class FeatureTopDownChartParser(FeatureChartParser): def __init__(self, grammar, **parser_args): FeatureChartParser.__init__(self, grammar, TD_FEATURE_STRATEGY, **parser_args) class FeatureBottomUpChartParser(FeatureChartParser): def __init__(self, grammar, **parser_args): FeatureChartParser.__init__(self, grammar, BU_FEATURE_STRATEGY, **parser_args) class FeatureBottomUpLeftCornerChartParser(FeatureChartParser): def __init__(self, grammar, **parser_args): FeatureChartParser.__init__(self, grammar, BU_LC_FEATURE_STRATEGY, **parser_args) #//////////////////////////////////////////////////////////// # Instantiate Variable Chart #//////////////////////////////////////////////////////////// class InstantiateVarsChart(FeatureChart): """ A specialized chart that 'instantiates' variables whose names start with '@', by replacing them with unique new variables. In particular, whenever a complete edge is added to the chart, any variables in the edge's ``lhs`` whose names start with '@' will be replaced by unique new ``Variable``s. """ def __init__(self, tokens): FeatureChart.__init__(self, tokens) def initialize(self): self._instantiated = set() FeatureChart.initialize(self) def insert(self, edge, child_pointer_list): if edge in self._instantiated: return False self.instantiate_edge(edge) return FeatureChart.insert(self, edge, child_pointer_list) def instantiate_edge(self, edge): """ If the edge is a ``FeatureTreeEdge``, and it is complete, then instantiate all variables whose names start with '@', by replacing them with unique new variables. Note that instantiation is done in-place, since the parsing algorithms might already hold a reference to the edge for future use. """ # If the edge is a leaf, or is not complete, or is # already in the chart, then just return it as-is. if not isinstance(edge, FeatureTreeEdge): return if not edge.is_complete(): return if edge in self._edge_to_cpls: return # Get a list of variables that need to be instantiated. # If there are none, then return as-is. 
inst_vars = self.inst_vars(edge) if not inst_vars: return # Instantiate the edge! self._instantiated.add(edge) edge._lhs = edge.lhs().substitute_bindings(inst_vars) def inst_vars(self, edge): return dict((var, logic.unique_variable()) for var in edge.lhs().variables() if var.name.startswith('@')) #//////////////////////////////////////////////////////////// # Demo #//////////////////////////////////////////////////////////// def demo_grammar(): from nltk.grammar import FeatureGrammar return FeatureGrammar.fromstring(""" S -> NP VP PP -> Prep NP NP -> NP PP VP -> VP PP VP -> Verb NP VP -> Verb NP -> Det[pl=?x] Noun[pl=?x] NP -> "John" NP -> "I" Det -> "the" Det -> "my" Det[-pl] -> "a" Noun[-pl] -> "dog" Noun[-pl] -> "cookie" Verb -> "ate" Verb -> "saw" Prep -> "with" Prep -> "under" """) def demo(print_times=True, print_grammar=True, print_trees=True, print_sentence=True, trace=1, parser=FeatureChartParser, sent='I saw John with a dog with my cookie'): import sys, time print() grammar = demo_grammar() if print_grammar: print(grammar) print() print("*", parser.__name__) if print_sentence: print("Sentence:", sent) tokens = sent.split() t = time.clock() cp = parser(grammar, trace=trace) chart = cp.chart_parse(tokens) trees = list(chart.parses(grammar.start())) if print_times: print("Time: %s" % (time.clock() - t)) if print_trees: for tree in trees: print(tree) else: print("Nr trees:", len(trees)) def run_profile(): import profile profile.run('for i in range(1): demo()', '/tmp/profile.out') import pstats p = pstats.Stats('/tmp/profile.out') p.strip_dirs().sort_stats('time', 'cum').print_stats(60) p.strip_dirs().sort_stats('cum', 'time').print_stats(60) if __name__ == '__main__': from nltk.data import load demo() print() grammar = load('grammars/book_grammars/feat0.fcfg') cp = FeatureChartParser(grammar, trace=2) sent = 'Kim likes children' tokens = sent.split() trees = cp.parse(tokens) for tree in trees: print(tree) nltk-3.1/nltk/parse/generate.py0000644000076500000240000000407212607224144016307 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Generating from a CFG # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Peter Ljunglöf # URL: # For license information, see LICENSE.TXT # from __future__ import print_function import itertools import sys from nltk.grammar import Nonterminal def generate(grammar, start=None, depth=None, n=None): """ Generates an iterator of all sentences from a CFG. :param grammar: The Grammar used to generate sentences. :param start: The Nonterminal from which to start generate sentences. :param depth: The maximal depth of the generated tree. :param n: The maximum number of sentences to return. :return: An iterator of lists of terminal tokens. 
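    A small usage sketch (the toy grammar here is illustrative):

        >>> from nltk.grammar import CFG
        >>> toy = CFG.fromstring("S -> 'a' S | 'b'")
        >>> for sent in generate(toy, depth=4):
        ...     print(' '.join(sent))
        a a b
        a b
        b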
""" if not start: start = grammar.start() if depth is None: depth = sys.maxsize iter = _generate_all(grammar, [start], depth) if n: iter = itertools.islice(iter, n) return iter def _generate_all(grammar, items, depth): if items: for frag1 in _generate_one(grammar, items[0], depth): for frag2 in _generate_all(grammar, items[1:], depth): yield frag1 + frag2 else: yield [] def _generate_one(grammar, item, depth): if depth > 0: if isinstance(item, Nonterminal): for prod in grammar.productions(lhs=item): for frag in _generate_all(grammar, prod.rhs(), depth-1): yield frag else: yield [item] demo_grammar = """ S -> NP VP NP -> Det N PP -> P NP VP -> 'slept' | 'saw' NP | 'walked' PP Det -> 'the' | 'a' N -> 'man' | 'park' | 'dog' P -> 'in' | 'with' """ def demo(N=23): from nltk.grammar import CFG print('Generating the first %d sentences for demo grammar:' % (N,)) print(demo_grammar) grammar = CFG.fromstring(demo_grammar) for n, sent in enumerate(generate(grammar, n=N), 1): print('%3d. %s' % (n, ' '.join(sent))) if __name__ == '__main__': demo() nltk-3.1/nltk/parse/malt.py0000644000076500000240000003577712607224144015472 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Interface to MaltParser # # Author: Dan Garrette # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT from __future__ import print_function from __future__ import unicode_literals from nltk.six import text_type import os import tempfile import subprocess import inspect from nltk.data import ZipFilePathPointer from nltk.internals import find_dir, find_file, find_jars_within_path from nltk.parse.api import ParserI from nltk.parse.dependencygraph import DependencyGraph from nltk.parse.util import taggedsents_to_conll def malt_regex_tagger(): from nltk.tag import RegexpTagger _tagger = RegexpTagger( [(r'\.$','.'), (r'\,$',','), (r'\?$','?'), # fullstop, comma, Qmark (r'\($','('), (r'\)$',')'), # round brackets (r'\[$','['), (r'\]$',']'), # square brackets (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'DT'), # articles (r'(He|he|She|she|It|it|I|me|Me|You|you)$', 'PRP'), # pronouns (r'(His|his|Her|her|Its|its)$', 'PRP$'), # possesive (r'(my|Your|your|Yours|yours)$', 'PRP$'), # possesive (r'(on|On|in|In|at|At|since|Since)$', 'IN'),# time prepopsitions (r'(for|For|ago|Ago|before|Before)$', 'IN'),# time prepopsitions (r'(till|Till|until|Until)$', 'IN'), # time prepopsitions (r'(by|By|beside|Beside)$', 'IN'), # space prepopsitions (r'(under|Under|below|Below)$', 'IN'), # space prepopsitions (r'(over|Over|above|Above)$', 'IN'), # space prepopsitions (r'(across|Across|through|Through)$', 'IN'),# space prepopsitions (r'(into|Into|towards|Towards)$', 'IN'), # space prepopsitions (r'(onto|Onto|from|From)$', 'IN'), # space prepopsitions (r'.*able$', 'JJ'), # adjectives (r'.*ness$', 'NN'), # nouns formed from adjectives (r'.*ly$', 'RB'), # adverbs (r'.*s$', 'NNS'), # plural nouns (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # past tense verbs (r'.*', 'NN'), # nouns (default) ]) return _tagger.tag def find_maltparser(parser_dirname): """ A module to find MaltParser .jar file and its dependencies. """ if os.path.exists(parser_dirname): # If a full path is given. _malt_dir = parser_dirname else: # Try to find path to maltparser directory in environment variables. 
_malt_dir = find_dir(parser_dirname, env_vars=('MALT_PARSER',)) # Checks that that the found directory contains all the necessary .jar malt_dependencies = ['','',''] _malt_jars = set(find_jars_within_path(_malt_dir)) _jars = set(jar.rpartition('/')[2] for jar in _malt_jars) malt_dependencies = set(['log4j.jar', 'libsvm.jar', 'liblinear-1.8.jar']) assert malt_dependencies.issubset(_jars) assert any(filter(lambda i: i.startswith('maltparser-') and i.endswith('.jar'), _jars)) return list(_malt_jars) def find_malt_model(model_filename): """ A module to find pre-trained MaltParser model. """ if model_filename == None: return 'malt_temp.mco' elif os.path.exists(model_filename): # If a full path is given. return model_filename else: # Try to find path to malt model in environment variables. return find_file(model_filename, env_vars=('MALT_MODEL',), verbose=False) class MaltParser(ParserI): """ A class for dependency parsing with MaltParser. The input is the paths to: - a maltparser directory - (optionally) the path to a pre-trained MaltParser .mco model file - (optionally) the tagger to use for POS tagging before parsing - (optionally) additional Java arguments Example: >>> from nltk.parse import malt >>> # With MALT_PARSER and MALT_MODEL environment set. >>> mp = malt.MaltParser('maltparser-1.7.2', 'engmalt.linear-1.7.mco') # doctest: +SKIP >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP (shot I (elephant an) (in (pajamas my)) .) >>> # Without MALT_PARSER and MALT_MODEL environment. >>> mp = malt.MaltParser('/home/user/maltparser-1.7.2/', '/home/user/engmalt.linear-1.7.mco') # doctest: +SKIP >>> mp.parse_one('I shot an elephant in my pajamas .'.split()).tree() # doctest: +SKIP (shot I (elephant an) (in (pajamas my)) .) """ def __init__(self, parser_dirname, model_filename=None, tagger=None, additional_java_args=None): """ An interface for parsing with the Malt Parser. :param parser_dirname: The path to the maltparser directory that contains the maltparser-1.x.jar :type parser_dirname: str :param model_filename: The name of the pre-trained model with .mco file extension. If provided, training will not be required. (see http://www.maltparser.org/mco/mco.html and see http://www.patful.com/chalk/node/185) :type model_filename: str :param tagger: The tagger used to POS tag the raw string before formatting to CONLL format. It should behave like `nltk.pos_tag` :type tagger: function :param additional_java_args: This is the additional Java arguments that one can use when calling Maltparser, usually this is the heapsize limits, e.g. `additional_java_args=['-Xmx1024m']` (see http://goo.gl/mpDBvQ) :type additional_java_args: list """ # Find all the necessary jar files for MaltParser. self.malt_jars = find_maltparser(parser_dirname) # Initialize additional java arguments. self.additional_java_args = additional_java_args if \ additional_java_args is not None else [] # Initialize model. self.model = find_malt_model(model_filename) self._trained = self.model != 'malt_temp.mco' # Set the working_dir parameters i.e. `-w` from MaltParser's option. self.working_dir = tempfile.gettempdir() # Initialize POS tagger. self.tagger = tagger if tagger is not None else malt_regex_tagger() def parse_tagged_sents(self, sentences, verbose=False, top_relation_label='null'): """ Use MaltParser to parse multiple POS tagged sentences. Takes multiple sentences where each sentence is a list of (word, tag) tuples. The sentences must have already been tokenized and tagged. 
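        Illustrative call (the parser directory and pre-trained model name are
        assumptions, mirroring the class docstring above):

            >>> mp = MaltParser('maltparser-1.7.2', 'engmalt.linear-1.7.mco') # doctest: +SKIP
            >>> tagged = [[('I', 'PRP'), ('saw', 'VBD'), ('him', 'PRP'), ('.', '.')]]
            >>> graphs = mp.parse_tagged_sents(tagged) # doctest: +SKIP
            >>> next(next(graphs)).tree() # doctest: +SKIP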
:param sentences: Input sentences to parse :type sentence: list(list(tuple(str, str))) :return: iter(iter(``DependencyGraph``)) the dependency graph representation of each sentence """ if not self._trained: raise Exception("Parser has not been trained. Call train() first.") with tempfile.NamedTemporaryFile(prefix='malt_input.conll.', dir=self.working_dir, mode='w', delete=False) as input_file: with tempfile.NamedTemporaryFile(prefix='malt_output.conll.', dir=self.working_dir, mode='w', delete=False) as output_file: # Convert list of sentences to CONLL format. for line in taggedsents_to_conll(sentences): input_file.write(text_type(line)) input_file.close() # Generate command to run maltparser. cmd =self.generate_malt_command(input_file.name, output_file.name, mode="parse") # This is a maltparser quirk, it needs to be run # where the model file is. otherwise it goes into an awkward # missing .jars or strange -w working_dir problem. _current_path = os.getcwd() # Remembers the current path. try: # Change to modelfile path os.chdir(os.path.split(self.model)[0]) except: pass ret = self._execute(cmd, verbose) # Run command. os.chdir(_current_path) # Change back to current path. if ret is not 0: raise Exception("MaltParser parsing (%s) failed with exit " "code %d" % (' '.join(cmd), ret)) # Must return iter(iter(Tree)) with open(output_file.name) as infile: for tree_str in infile.read().split('\n\n'): yield(iter([DependencyGraph(tree_str, top_relation_label=top_relation_label)])) os.remove(input_file.name) os.remove(output_file.name) def parse_sents(self, sentences, verbose=False, top_relation_label='null'): """ Use MaltParser to parse multiple sentences. Takes a list of sentences, where each sentence is a list of words. Each sentence will be automatically tagged with this MaltParser instance's tagger. :param sentences: Input sentences to parse :type sentence: list(list(str)) :return: iter(DependencyGraph) """ tagged_sentences = (self.tagger(sentence) for sentence in sentences) return self.parse_tagged_sents(tagged_sentences, verbose, top_relation_label=top_relation_label) def generate_malt_command(self, inputfilename, outputfilename=None, mode=None): """ This function generates the maltparser command use at the terminal. :param inputfilename: path to the input file :type inputfilename: str :param outputfilename: path to the output file :type outputfilename: str """ cmd = ['java'] cmd+= self.additional_java_args # Adds additional java arguments. cmd+= ['-cp', ':'.join(self.malt_jars)] # Adds classpaths for jars cmd+= ['org.maltparser.Malt'] # Adds the main function. # Adds the model file. if os.path.exists(self.model): # when parsing cmd+= ['-c', os.path.split(self.model)[-1]] else: # when learning cmd+= ['-c', self.model] cmd+= ['-i', inputfilename] if mode == 'parse': cmd+= ['-o', outputfilename] cmd+= ['-m', mode] # mode use to generate parses. 
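        # At this point ``cmd`` is a flat list of strings ready for
        # subprocess.Popen() in _execute(); with illustrative values it looks
        # roughly like:
        #   ['java', '-Xmx1024m', '-cp', '/path/maltparser-1.7.2.jar:/path/liblinear-1.8.jar:...',
        #    'org.maltparser.Malt', '-c', 'engmalt.linear-1.7.mco',
        #    '-i', '/tmp/malt_input.conll.XXXX', '-o', '/tmp/malt_output.conll.XXXX',
        #    '-m', 'parse']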
return cmd @staticmethod def _execute(cmd, verbose=False): output = None if verbose else subprocess.PIPE p = subprocess.Popen(cmd, stdout=output, stderr=output) return p.wait() def train(self, depgraphs, verbose=False): """ Train MaltParser from a list of ``DependencyGraph`` objects :param depgraphs: list of ``DependencyGraph`` objects for training input data :type depgraphs: DependencyGraph """ # Write the conll_str to malt_train.conll file in /tmp/ with tempfile.NamedTemporaryFile(prefix='malt_train.conll.', dir=self.working_dir, mode='w', delete=False) as input_file: input_str = ('\n'.join(dg.to_conll(10) for dg in depgraphs)) input_file.write(text_type(input_str)) # Trains the model with the malt_train.conll self.train_from_file(input_file.name, verbose=verbose) # Removes the malt_train.conll once training finishes. os.remove(input_file.name) def train_from_file(self, conll_file, verbose=False): """ Train MaltParser from a file :param conll_file: str for the filename of the training input data :type conll_file: str """ # If conll_file is a ZipFilePathPointer, # then we need to do some extra massaging if isinstance(conll_file, ZipFilePathPointer): with tempfile.NamedTemporaryFile(prefix='malt_train.conll.', dir=self.working_dir, mode='w', delete=False) as input_file: with conll_file.open() as conll_input_file: conll_str = conll_input_file.read() input_file.write(text_type(conll_str)) return self.train_from_file(input_file.name, verbose=verbose) # Generate command to run maltparser. cmd =self.generate_malt_command(conll_file, mode="learn") ret = self._execute(cmd, verbose) if ret != 0: raise Exception("MaltParser training (%s) failed with exit " "code %d" % (' '.join(cmd), ret)) self._trained = True if __name__ == '__main__': ''' A demostration function to show how NLTK users can use the malt parser API. >>> from nltk import pos_tag >>> assert 'MALT_PARSER' in os.environ, str( ... "Please set MALT_PARSER in your global environment, e.g.:\n" ... "$ export MALT_PARSER='/home/user/maltparser-1.7.2/'") >>> >>> assert 'MALT_MODEL' in os.environ, str( ... "Please set MALT_MODEL in your global environment, e.g.:\n" ... "$ export MALT_MODEL='/home/user/engmalt.linear-1.7.mco'") >>> >>> _dg1_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n" ... "2 sees _ VB _ _ 0 ROOT _ _\n" ... "3 a _ DT _ _ 4 SPEC _ _\n" ... "4 dog _ NN _ _ 2 OBJ _ _\n" ... "5 . _ . _ _ 2 PUNCT _ _\n") >>> >>> >>> _dg2_str = str("1 John _ NNP _ _ 2 SUBJ _ _\n" ... "2 walks _ VB _ _ 0 ROOT _ _\n" ... "3 . _ . _ _ 2 PUNCT _ _\n") >>> dg1 = DependencyGraph(_dg1_str) >>> dg2 = DependencyGraph(_dg2_str) >>> # Initialize a MaltParser object >>> parser_dirname = 'maltparser-1.7.2' >>> mp = MaltParser(parser_dirname=parser_dirname) >>> >>> # Trains a model. >>> mp.train([dg1,dg2], verbose=False) >>> sent1 = ['John','sees','Mary', '.'] >>> sent2 = ['John', 'walks', 'a', 'dog', '.'] >>> >>> # Parse a single sentence. >>> parsed_sent1 = mp.parse_one(sent1) >>> parsed_sent2 = mp.parse_one(sent2) >>> print (parsed_sent1.tree()) (sees John Mary .) >>> print (parsed_sent2.tree()) (walks John (dog a) .) >>> >>> # Parsing multiple sentences. >>> sentences = [sent1,sent2] >>> parsed_sents = mp.parse_sents(sentences) >>> print(next(next(parsed_sents)).tree()) (sees John Mary .) >>> print(next(next(parsed_sents)).tree()) (walks John (dog a) .) >>> >>> # Initialize a MaltParser object with an English pre-trained model. 
>>> parser_dirname = 'maltparser-1.7.2' >>> model_name = 'engmalt.linear-1.7.mco' >>> mp = MaltParser(parser_dirname=parser_dirname, model_filename=model_name, tagger=pos_tag) >>> sent1 = 'I shot an elephant in my pajamas .'.split() >>> sent2 = 'Time flies like banana .'.split() >>> # Parse a single sentence. >>> print(mp.parse_one(sent1).tree()) (shot I (elephant an) (in (pajamas my)) .) # Parsing multiple sentences >>> sentences = [sent1,sent2] >>> parsed_sents = mp.parse_sents(sentences) >>> print(next(next(parsed_sents)).tree()) (shot I (elephant an) (in (pajamas my)) .) >>> print(next(next(parsed_sents)).tree()) (flies Time (like banana) .) ''' import doctest doctest.testmod() nltk-3.1/nltk/parse/nonprojectivedependencyparser.py0000644000076500000240000007114712607224144022665 0ustar sbstaff00000000000000# Natural Language Toolkit: Dependency Grammars # # Copyright (C) 2001-2015 NLTK Project # Author: Jason Narad # # URL: # For license information, see LICENSE.TXT # from __future__ import print_function import math import logging from nltk.compat import xrange from nltk.parse.dependencygraph import DependencyGraph logger = logging.getLogger(__name__) ################################################################# # DependencyScorerI - Interface for Graph-Edge Weight Calculation ################################################################# class DependencyScorerI(object): """ A scorer for calculated the weights on the edges of a weighted dependency graph. This is used by a ``ProbabilisticNonprojectiveParser`` to initialize the edge weights of a ``DependencyGraph``. While typically this would be done by training a binary classifier, any class that can return a multidimensional list representation of the edge weights can implement this interface. As such, it has no necessary fields. """ def __init__(self): if self.__class__ == DependencyScorerI: raise TypeError('DependencyScorerI is an abstract interface') def train(self, graphs): """ :type graphs: list(DependencyGraph) :param graphs: A list of dependency graphs to train the scorer. Typically the edges present in the graphs can be used as positive training examples, and the edges not present as negative examples. """ raise NotImplementedError() def score(self, graph): """ :type graph: DependencyGraph :param graph: A dependency graph whose set of edges need to be scored. :rtype: A three-dimensional list of numbers. :return: The score is returned in a multidimensional(3) list, such that the outer-dimension refers to the head, and the inner-dimension refers to the dependencies. For instance, scores[0][1] would reference the list of scores corresponding to arcs from node 0 to node 1. The node's 'address' field can be used to determine its number identification. For further illustration, a score list corresponding to Fig.2 of Keith Hall's 'K-best Spanning Tree Parsing' paper: scores = [[[], [5], [1], [1]], [[], [], [11], [4]], [[], [10], [], [5]], [[], [8], [8], []]] When used in conjunction with a MaxEntClassifier, each score would correspond to the confidence of a particular edge being classified with the positive training examples. """ raise NotImplementedError() ################################################################# # NaiveBayesDependencyScorer ################################################################# class NaiveBayesDependencyScorer(DependencyScorerI): """ A dependency scorer built around a MaxEnt classifier. In this particular class that classifier is a ``NaiveBayesClassifier``. 
It uses head-word, head-tag, child-word, and child-tag features for classification. >>> from nltk.parse.dependencygraph import DependencyGraph, conll_data2 >>> graphs = [DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry] >>> npp = ProbabilisticNonprojectiveParser() >>> npp.train(graphs, NaiveBayesDependencyScorer()) >>> parses = npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc']) >>> len(list(parses)) 1 """ def __init__(self): pass # Do nothing without throwing error def train(self, graphs): """ Trains a ``NaiveBayesClassifier`` using the edges present in graphs list as positive examples, the edges not present as negative examples. Uses a feature vector of head-word, head-tag, child-word, and child-tag. :type graphs: list(DependencyGraph) :param graphs: A list of dependency graphs to train the scorer. """ from nltk.classify import NaiveBayesClassifier # Create training labeled training examples labeled_examples = [] for graph in graphs: for head_node in graph.nodes.values(): for child_index, child_node in graph.nodes.items(): if child_index in head_node['deps']: label = "T" else: label = "F" labeled_examples.append( ( dict( a=head_node['word'], b=head_node['tag'], c=child_node['word'], d=child_node['tag'], ), label, ) ) self.classifier = NaiveBayesClassifier.train(labeled_examples) def score(self, graph): """ Converts the graph into a feature-based representation of each edge, and then assigns a score to each based on the confidence of the classifier in assigning it to the positive label. Scores are returned in a multidimensional list. :type graph: DependencyGraph :param graph: A dependency graph to score. :rtype: 3 dimensional list :return: Edge scores for the graph parameter. """ # Convert graph to feature representation edges = [] for head_node in graph.nodes.values(): for child_node in graph.nodes.values(): edges.append( ( dict( a=head_node['word'], b=head_node['tag'], c=child_node['word'], d=child_node['tag'], ) ) ) # Score edges edge_scores = [] row = [] count = 0 for pdist in self.classifier.prob_classify_many(edges): logger.debug('%.4f %.4f', pdist.prob('T'), pdist.prob('F')) # smoothing in case the probability = 0 row.append([math.log(pdist.prob("T")+0.00000000001)]) count += 1 if count == len(graph.nodes): edge_scores.append(row) row = [] count = 0 return edge_scores ################################################################# # A Scorer for Demo Purposes ################################################################# # A short class necessary to show parsing example from paper class DemoScorer(DependencyScorerI): def train(self, graphs): print('Training...') def score(self, graph): # scores for Keith Hall 'K-best Spanning Tree Parsing' paper return [[[], [5], [1], [1]], [[], [], [11], [4]], [[], [10], [], [5]], [[], [8], [8], []]] ################################################################# # Non-Projective Probabilistic Parsing ################################################################# class ProbabilisticNonprojectiveParser(object): """A probabilistic non-projective dependency parser. Nonprojective dependencies allows for "crossing branches" in the parse tree which is necessary for representing particular linguistic phenomena, or even typical parses in some languages. This parser follows the MST parsing algorithm, outlined in McDonald(2005), which likens the search for the best non-projective parse to finding the maximum spanning tree in a weighted directed graph. 
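    The example below supplies a hand-written scorer; its matrix follows the
    ``DependencyScorerI.score`` convention, where ``scores[head][dependent]``
    holds a list of arc scores and index 0 corresponds to the artificial root
    node. Reading that matrix, the arc from node 1 to node 2 scores 11, while
    the reverse arc from 2 to 1 scores 10.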
>>> class Scorer(DependencyScorerI): ... def train(self, graphs): ... pass ... ... def score(self, graph): ... return [ ... [[], [5], [1], [1]], ... [[], [], [11], [4]], ... [[], [10], [], [5]], ... [[], [8], [8], []], ... ] >>> npp = ProbabilisticNonprojectiveParser() >>> npp.train([], Scorer()) >>> parses = npp.parse(['v1', 'v2', 'v3'], [None, None, None]) >>> len(list(parses)) 1 Rule based example ------------------ >>> from nltk.grammar import DependencyGrammar >>> grammar = DependencyGrammar.fromstring(''' ... 'taught' -> 'play' | 'man' ... 'man' -> 'the' | 'in' ... 'in' -> 'corner' ... 'corner' -> 'the' ... 'play' -> 'golf' | 'dachshund' | 'to' ... 'dachshund' -> 'his' ... ''') >>> ndp = NonprojectiveDependencyParser(grammar) >>> parses = ndp.parse(['the', 'man', 'in', 'the', 'corner', 'taught', 'his', 'dachshund', 'to', 'play', 'golf']) >>> len(list(parses)) 4 """ def __init__(self): """ Creates a new non-projective parser. """ logging.debug('initializing prob. nonprojective...') def train(self, graphs, dependency_scorer): """ Trains a ``DependencyScorerI`` from a set of ``DependencyGraph`` objects, and establishes this as the parser's scorer. This is used to initialize the scores on a ``DependencyGraph`` during the parsing procedure. :type graphs: list(DependencyGraph) :param graphs: A list of dependency graphs to train the scorer. :type dependency_scorer: DependencyScorerI :param dependency_scorer: A scorer which implements the ``DependencyScorerI`` interface. """ self._scorer = dependency_scorer self._scorer.train(graphs) def initialize_edge_scores(self, graph): """ Assigns a score to every edge in the ``DependencyGraph`` graph. These scores are generated via the parser's scorer which was assigned during the training process. :type graph: DependencyGraph :param graph: A dependency graph to assign scores to. """ self.scores = self._scorer.score(graph) def collapse_nodes(self, new_node, cycle_path, g_graph, b_graph, c_graph): """ Takes a list of nodes that have been identified to belong to a cycle, and collapses them into on larger node. The arcs of all nodes in the graph must be updated to account for this. :type new_node: Node. :param new_node: A Node (Dictionary) to collapse the cycle nodes into. :type cycle_path: A list of integers. :param cycle_path: A list of node addresses, each of which is in the cycle. :type g_graph, b_graph, c_graph: DependencyGraph :param g_graph, b_graph, c_graph: Graphs which need to be updated. """ logger.debug('Collapsing nodes...') # Collapse all cycle nodes into v_n+1 in G_Graph for cycle_node_index in cycle_path: g_graph.remove_by_address(cycle_node_index) g_graph.add_node(new_node) g_graph.redirect_arcs(cycle_path, new_node['address']) def update_edge_scores(self, new_node, cycle_path): """ Updates the edge scores to reflect a collapse operation into new_node. :type new_node: A Node. :param new_node: The node which cycle nodes are collapsed into. :type cycle_path: A list of integers. :param cycle_path: A list of node addresses that belong to the cycle. 
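        Illustrative arithmetic (made-up numbers): if the best arc reaching a
        cycle node j from inside the cycle scores 10, an outside arc i -> j
        scoring 8 is stored as 8 - 10 = -2 after the collapse; picking that arc
        later therefore implicitly pays for breaking the cycle's own arc into j.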
""" logger.debug('cycle %s', cycle_path) cycle_path = self.compute_original_indexes(cycle_path) logger.debug('old cycle %s', cycle_path) logger.debug('Prior to update: %s', self.scores) for i, row in enumerate(self.scores): for j, column in enumerate(self.scores[i]): logger.debug(self.scores[i][j]) if ( j in cycle_path and i not in cycle_path and self.scores[i][j] ): subtract_val = self.compute_max_subtract_score(j, cycle_path) logger.debug('%s - %s', self.scores[i][j], subtract_val) new_vals = [] for cur_val in self.scores[i][j]: new_vals.append(cur_val - subtract_val) self.scores[i][j] = new_vals for i, row in enumerate(self.scores): for j, cell in enumerate(self.scores[i]): if i in cycle_path and j in cycle_path: self.scores[i][j] = [] logger.debug('After update: %s', self.scores) def compute_original_indexes(self, new_indexes): """ As nodes are collapsed into others, they are replaced by the new node in the graph, but it's still necessary to keep track of what these original nodes were. This takes a list of node addresses and replaces any collapsed node addresses with their original addresses. :type new_indexes: A list of integers. :param new_indexes: A list of node addresses to check for subsumed nodes. """ swapped = True while swapped: originals = [] swapped = False for new_index in new_indexes: if new_index in self.inner_nodes: for old_val in self.inner_nodes[new_index]: if old_val not in originals: originals.append(old_val) swapped = True else: originals.append(new_index) new_indexes = originals return new_indexes def compute_max_subtract_score(self, column_index, cycle_indexes): """ When updating scores the score of the highest-weighted incoming arc is subtracted upon collapse. This returns the correct amount to subtract from that edge. :type column_index: integer. :param column_index: A index representing the column of incoming arcs to a particular node being updated :type cycle_indexes: A list of integers. :param cycle_indexes: Only arcs from cycle nodes are considered. This is a list of such nodes addresses. """ max_score = -100000 for row_index in cycle_indexes: for subtract_val in self.scores[row_index][column_index]: if subtract_val > max_score: max_score = subtract_val return max_score def best_incoming_arc(self, node_index): """ Returns the source of the best incoming arc to the node with address: node_index :type node_index: integer. :param node_index: The address of the 'destination' node, the node that is arced to. 
""" originals = self.compute_original_indexes([node_index]) logger.debug('originals: %s', originals) max_arc = None max_score = None for row_index in range(len(self.scores)): for col_index in range(len(self.scores[row_index])): # print self.scores[row_index][col_index] if col_index in originals and (max_score is None or self.scores[row_index][col_index] > max_score): max_score = self.scores[row_index][col_index] max_arc = row_index logger.debug('%s, %s', row_index, col_index) logger.debug(max_score) for key in self.inner_nodes: replaced_nodes = self.inner_nodes[key] if max_arc in replaced_nodes: return key return max_arc def original_best_arc(self, node_index): originals = self.compute_original_indexes([node_index]) max_arc = None max_score = None max_orig = None for row_index in range(len(self.scores)): for col_index in range(len(self.scores[row_index])): if col_index in originals and (max_score is None or self.scores[row_index][col_index] > max_score): max_score = self.scores[row_index][col_index] max_arc = row_index max_orig = col_index return [max_arc, max_orig] def parse(self, tokens, tags): """ Parses a list of tokens in accordance to the MST parsing algorithm for non-projective dependency parses. Assumes that the tokens to be parsed have already been tagged and those tags are provided. Various scoring methods can be used by implementing the ``DependencyScorerI`` interface and passing it to the training algorithm. :type tokens: list(str) :param tokens: A list of words or punctuation to be parsed. :type tags: list(str) :param tags: A list of tags corresponding by index to the words in the tokens list. :return: An iterator of non-projective parses. :rtype: iter(DependencyGraph) """ self.inner_nodes = {} # Initialize g_graph g_graph = DependencyGraph() for index, token in enumerate(tokens): g_graph.nodes[index + 1].update( { 'word': token, 'tag': tags[index], 'rel': 'NTOP', 'address': index + 1, } ) #print (g_graph.nodes) # Fully connect non-root nodes in g_graph g_graph.connect_graph() original_graph = DependencyGraph() for index, token in enumerate(tokens): original_graph.nodes[index + 1].update( { 'word': token, 'tag': tags[index], 'rel': 'NTOP', 'address': index+1, } ) b_graph = DependencyGraph() c_graph = DependencyGraph() for index, token in enumerate(tokens): c_graph.nodes[index + 1].update( { 'word': token, 'tag': tags[index], 'rel': 'NTOP', 'address': index + 1, } ) # Assign initial scores to g_graph edges self.initialize_edge_scores(g_graph) logger.debug(self.scores) # Initialize a list of unvisited vertices (by node address) unvisited_vertices = [ vertex['address'] for vertex in c_graph.nodes.values() ] # Iterate over unvisited vertices nr_vertices = len(tokens) betas = {} while unvisited_vertices: # Mark current node as visited current_vertex = unvisited_vertices.pop(0) logger.debug('current_vertex: %s', current_vertex) # Get corresponding node n_i to vertex v_i current_node = g_graph.get_by_address(current_vertex) logger.debug('current_node: %s', current_node) # Get best in-edge node b for current node best_in_edge = self.best_incoming_arc(current_vertex) betas[current_vertex] = self.original_best_arc(current_vertex) logger.debug('best in arc: %s --> %s', best_in_edge, current_vertex) # b_graph = Union(b_graph, b) for new_vertex in [current_vertex, best_in_edge]: b_graph.nodes[new_vertex].update( { 'word': 'TEMP', 'rel': 'NTOP', 'address': new_vertex, } ) b_graph.add_arc(best_in_edge, current_vertex) # Beta(current node) = b - stored for parse recovery # If b_graph contains 
a cycle, collapse it cycle_path = b_graph.contains_cycle() if cycle_path: # Create a new node v_n+1 with address = len(nodes) + 1 new_node = { 'word': 'NONE', 'rel': 'NTOP', 'address': nr_vertices + 1, } # c_graph = Union(c_graph, v_n+1) c_graph.add_node(new_node) # Collapse all nodes in cycle C into v_n+1 self.update_edge_scores(new_node, cycle_path) self.collapse_nodes(new_node, cycle_path, g_graph, b_graph, c_graph) for cycle_index in cycle_path: c_graph.add_arc(new_node['address'], cycle_index) # self.replaced_by[cycle_index] = new_node['address'] self.inner_nodes[new_node['address']] = cycle_path # Add v_n+1 to list of unvisited vertices unvisited_vertices.insert(0, nr_vertices + 1) # increment # of nodes counter nr_vertices += 1 # Remove cycle nodes from b_graph; B = B - cycle c for cycle_node_address in cycle_path: b_graph.remove_by_address(cycle_node_address) logger.debug('g_graph: %s', g_graph) logger.debug('b_graph: %s', b_graph) logger.debug('c_graph: %s', c_graph) logger.debug('Betas: %s', betas) logger.debug('replaced nodes %s', self.inner_nodes) # Recover parse tree logger.debug('Final scores: %s', self.scores) logger.debug('Recovering parse...') for i in range(len(tokens) + 1, nr_vertices + 1): betas[betas[i][1]] = betas[i] logger.debug('Betas: %s', betas) for node in original_graph.nodes.values(): # TODO: It's dangerous to assume that deps it a dictionary # because it's a default dictionary. Ideally, here we should not # be concerned how dependencies are stored inside of a dependency # graph. node['deps'] = {} for i in range(1, len(tokens) + 1): original_graph.add_arc(betas[i][0], betas[i][1]) logger.debug('Done.') yield original_graph ################################################################# # Rule-based Non-Projective Parser ################################################################# class NonprojectiveDependencyParser(object): """ A non-projective, rule-based, dependency parser. This parser will return the set of all possible non-projective parses based on the word-to-word relations defined in the parser's dependency grammar, and will allow the branches of the parse tree to cross in order to capture a variety of linguistic phenomena that a projective parser will not. """ def __init__(self, dependency_grammar): """ Creates a new ``NonprojectiveDependencyParser``. :param dependency_grammar: a grammar of word-to-word relations. :type dependency_grammar: DependencyGrammar """ self._grammar = dependency_grammar def parse(self, tokens): """ Parses the input tokens with respect to the parser's grammar. Parsing is accomplished by representing the search-space of possible parses as a fully-connected directed graph. Arcs that would lead to ungrammatical parses are removed and a lattice is constructed of length n, where n is the number of input tokens, to represent all possible grammatical traversals. All possible paths through the lattice are then enumerated to produce the set of non-projective parses. param tokens: A list of tokens to parse. type tokens: list(str) return: An iterator of non-projective parses. 
rtype: iter(DependencyGraph) """ # Create graph representation of tokens self._graph = DependencyGraph() for index, token in enumerate(tokens): self._graph.nodes[index] = { 'word': token, 'deps': [], 'rel': 'NTOP', 'address': index, } for head_node in self._graph.nodes.values(): deps = [] for dep_node in self._graph.nodes.values() : if ( self._grammar.contains(head_node['word'], dep_node['word']) and head_node['word'] != dep_node['word'] ): deps.append(dep_node['address']) head_node['deps'] = deps # Create lattice of possible heads roots = [] possible_heads = [] for i, word in enumerate(tokens): heads = [] for j, head in enumerate(tokens): if (i != j) and self._grammar.contains(head, word): heads.append(j) if len(heads) == 0: roots.append(i) possible_heads.append(heads) # Set roots to attempt if len(roots) < 2: if len(roots) == 0: for i in range(len(tokens)): roots.append(i) # Traverse lattice analyses = [] for root in roots: stack = [] analysis = [[] for i in range(len(possible_heads))] i = 0 forward = True while i >= 0: if forward: if len(possible_heads[i]) == 1: analysis[i] = possible_heads[i][0] elif len(possible_heads[i]) == 0: analysis[i] = -1 else: head = possible_heads[i].pop() analysis[i] = head stack.append([i, head]) if not forward: index_on_stack = False for stack_item in stack: if stack_item[0] == i: index_on_stack = True orig_length = len(possible_heads[i]) if index_on_stack and orig_length == 0: for j in xrange(len(stack) - 1, -1, -1): stack_item = stack[j] if stack_item[0] == i: possible_heads[i].append(stack.pop(j)[1]) elif index_on_stack and orig_length > 0: head = possible_heads[i].pop() analysis[i] = head stack.append([i, head]) forward = True if i + 1 == len(possible_heads): analyses.append(analysis[:]) forward = False if forward: i += 1 else: i -= 1 # Filter parses # ensure 1 root, every thing has 1 head for analysis in analyses: if analysis.count(-1) > 1: # there are several root elements! 
continue graph = DependencyGraph() graph.root = graph.nodes[analysis.index(-1) + 1] for address, (token, head_index) in enumerate(zip(tokens, analysis), start=1): head_address = head_index + 1 node = graph.nodes[address] node.update( { 'word': token, 'address': address, } ) if head_address == 0: rel = 'ROOT' else: rel = '' graph.nodes[head_index + 1]['deps'][rel].append(address) # TODO: check for cycles yield graph ################################################################# # Demos ################################################################# def demo(): # hall_demo() nonprojective_conll_parse_demo() rule_based_demo() def hall_demo(): npp = ProbabilisticNonprojectiveParser() npp.train([], DemoScorer()) for parse_graph in npp.parse(['v1', 'v2', 'v3'], [None, None, None]): print(parse_graph) def nonprojective_conll_parse_demo(): from nltk.parse.dependencygraph import conll_data2 graphs = [ DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry ] npp = ProbabilisticNonprojectiveParser() npp.train(graphs, NaiveBayesDependencyScorer()) for parse_graph in npp.parse(['Cathy', 'zag', 'hen', 'zwaaien', '.'], ['N', 'V', 'Pron', 'Adj', 'N', 'Punc']): print(parse_graph) def rule_based_demo(): from nltk.grammar import DependencyGrammar grammar = DependencyGrammar.fromstring(""" 'taught' -> 'play' | 'man' 'man' -> 'the' | 'in' 'in' -> 'corner' 'corner' -> 'the' 'play' -> 'golf' | 'dachshund' | 'to' 'dachshund' -> 'his' """) print(grammar) ndp = NonprojectiveDependencyParser(grammar) graphs = ndp.parse(['the', 'man', 'in', 'the', 'corner', 'taught', 'his', 'dachshund', 'to', 'play', 'golf']) print('Graphs:') for graph in graphs: print(graph) if __name__ == '__main__': demo() nltk-3.1/nltk/parse/pchart.py0000644000076500000240000004661112607224144016003 0ustar sbstaff00000000000000# Natural Language Toolkit: Probabilistic Chart Parsers # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT """ Classes and interfaces for associating probabilities with tree structures that represent the internal organization of a text. The probabilistic parser module defines ``BottomUpProbabilisticChartParser``. ``BottomUpProbabilisticChartParser`` is an abstract class that implements a bottom-up chart parser for ``PCFG`` grammars. It maintains a queue of edges, and adds them to the chart one at a time. The ordering of this queue is based on the probabilities associated with the edges, allowing the parser to expand more likely edges before less likely ones. Each subclass implements a different queue ordering, producing different search strategies. Currently the following subclasses are defined: - ``InsideChartParser`` searches edges in decreasing order of their trees' inside probabilities. - ``RandomChartParser`` searches edges in random order. - ``LongestChartParser`` searches edges in decreasing order of their location's length. The ``BottomUpProbabilisticChartParser`` constructor has an optional argument beam_size. If non-zero, this controls the size of the beam (aka the edge queue). This option is most useful with InsideChartParser. """ from __future__ import print_function, unicode_literals ##////////////////////////////////////////////////////// ## Bottom-Up PCFG Chart Parser ##////////////////////////////////////////////////////// # [XX] This might not be implemented quite right -- it would be better # to associate probabilities with child pointer lists. 
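# A minimal usage sketch (the toy grammar and numbers below are illustrative;
# see demo() at the end of this module for fuller examples):
#
#     >>> from nltk import PCFG
#     >>> from nltk.parse import pchart
#     >>> grammar = PCFG.fromstring("S -> 'a' S [0.4] | 'b' [0.6]")
#     >>> parser = pchart.InsideChartParser(grammar, beam_size=20)
#     >>> for tree in parser.parse('a a b'.split()):
#     ...     print(round(tree.prob(), 3))
#     0.096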
from functools import reduce from nltk.tree import Tree, ProbabilisticTree from nltk.grammar import Nonterminal, PCFG from nltk.parse.api import ParserI from nltk.parse.chart import Chart, LeafEdge, TreeEdge, AbstractChartRule from nltk.compat import python_2_unicode_compatible # Probabilistic edges class ProbabilisticLeafEdge(LeafEdge): def prob(self): return 1.0 class ProbabilisticTreeEdge(TreeEdge): def __init__(self, prob, *args, **kwargs): TreeEdge.__init__(self, *args, **kwargs) self._prob = prob # two edges with different probabilities are not equal. self._comparison_key = (self._comparison_key, prob) def prob(self): return self._prob @staticmethod def from_production(production, index, p): return ProbabilisticTreeEdge(p, (index, index), production.lhs(), production.rhs(), 0) # Rules using probabilistic edges class ProbabilisticBottomUpInitRule(AbstractChartRule): NUM_EDGES=0 def apply(self, chart, grammar): for index in range(chart.num_leaves()): new_edge = ProbabilisticLeafEdge(chart.leaf(index), index) if chart.insert(new_edge, ()): yield new_edge class ProbabilisticBottomUpPredictRule(AbstractChartRule): NUM_EDGES=1 def apply(self, chart, grammar, edge): if edge.is_incomplete(): return for prod in grammar.productions(): if edge.lhs() == prod.rhs()[0]: new_edge = ProbabilisticTreeEdge.from_production(prod, edge.start(), prod.prob()) if chart.insert(new_edge, ()): yield new_edge class ProbabilisticFundamentalRule(AbstractChartRule): NUM_EDGES=2 def apply(self, chart, grammar, left_edge, right_edge): # Make sure the rule is applicable. if not (left_edge.end() == right_edge.start() and left_edge.nextsym() == right_edge.lhs() and left_edge.is_incomplete() and right_edge.is_complete()): return # Construct the new edge. p = left_edge.prob() * right_edge.prob() new_edge = ProbabilisticTreeEdge(p, span=(left_edge.start(), right_edge.end()), lhs=left_edge.lhs(), rhs=left_edge.rhs(), dot=left_edge.dot()+1) # Add it to the chart, with appropriate child pointers. changed_chart = False for cpl1 in chart.child_pointer_lists(left_edge): if chart.insert(new_edge, cpl1+(right_edge,)): changed_chart = True # If we changed the chart, then generate the edge. if changed_chart: yield new_edge @python_2_unicode_compatible class SingleEdgeProbabilisticFundamentalRule(AbstractChartRule): NUM_EDGES=1 _fundamental_rule = ProbabilisticFundamentalRule() def apply(self, chart, grammar, edge1): fr = self._fundamental_rule if edge1.is_incomplete(): # edge1 = left_edge; edge2 = right_edge for edge2 in chart.select(start=edge1.end(), is_complete=True, lhs=edge1.nextsym()): for new_edge in fr.apply(chart, grammar, edge1, edge2): yield new_edge else: # edge2 = left_edge; edge1 = right_edge for edge2 in chart.select(end=edge1.start(), is_complete=False, nextsym=edge1.lhs()): for new_edge in fr.apply(chart, grammar, edge2, edge1): yield new_edge def __str__(self): return 'Fundamental Rule' class BottomUpProbabilisticChartParser(ParserI): """ An abstract bottom-up parser for ``PCFG`` grammars that uses a ``Chart`` to record partial results. ``BottomUpProbabilisticChartParser`` maintains a queue of edges that can be added to the chart. This queue is initialized with edges for each token in the text that is being parsed. ``BottomUpProbabilisticChartParser`` inserts these edges into the chart one at a time, starting with the most likely edges, and proceeding to less likely edges. For each edge that is added to the chart, it may become possible to insert additional edges into the chart; these are added to the queue. 
This process continues until enough complete parses have been generated, or until the queue is empty. The sorting order for the queue is not specified by ``BottomUpProbabilisticChartParser``. Different sorting orders will result in different search strategies. The sorting order for the queue is defined by the method ``sort_queue``; subclasses are required to provide a definition for this method. :type _grammar: PCFG :ivar _grammar: The grammar used to parse sentences. :type _trace: int :ivar _trace: The level of tracing output that should be generated when parsing a text. """ def __init__(self, grammar, beam_size=0, trace=0): """ Create a new ``BottomUpProbabilisticChartParser``, that uses ``grammar`` to parse texts. :type grammar: PCFG :param grammar: The grammar used to parse texts. :type beam_size: int :param beam_size: The maximum length for the parser's edge queue. :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; and higher numbers will produce more verbose tracing output. """ if not isinstance(grammar, PCFG): raise ValueError("The grammar must be probabilistic PCFG") self._grammar = grammar self.beam_size = beam_size self._trace = trace def grammar(self): return self._grammar def trace(self, trace=2): """ Set the level of tracing output that should be generated when parsing a text. :type trace: int :param trace: The trace level. A trace level of ``0`` will generate no tracing output; and higher trace levels will produce more verbose tracing output. :rtype: None """ self._trace = trace # TODO: change this to conform more with the standard ChartParser def parse(self, tokens): self._grammar.check_coverage(tokens) chart = Chart(list(tokens)) grammar = self._grammar # Chart parser rules. bu_init = ProbabilisticBottomUpInitRule() bu = ProbabilisticBottomUpPredictRule() fr = SingleEdgeProbabilisticFundamentalRule() # Our queue queue = [] # Initialize the chart. for edge in bu_init.apply(chart, grammar): if self._trace > 1: print(' %-50s [%s]' % (chart.pretty_format_edge(edge,width=2), edge.prob())) queue.append(edge) while len(queue) > 0: # Re-sort the queue. self.sort_queue(queue, chart) # Prune the queue to the correct size if a beam was defined if self.beam_size: self._prune(queue, chart) # Get the best edge. edge = queue.pop() if self._trace > 0: print(' %-50s [%s]' % (chart.pretty_format_edge(edge,width=2), edge.prob())) # Apply BU & FR to it. queue.extend(bu.apply(chart, grammar, edge)) queue.extend(fr.apply(chart, grammar, edge)) # Get a list of complete parses. parses = list(chart.parses(grammar.start(), ProbabilisticTree)) # Assign probabilities to the trees. prod_probs = {} for prod in grammar.productions(): prod_probs[prod.lhs(), prod.rhs()] = prod.prob() for parse in parses: self._setprob(parse, prod_probs) # Sort by probability parses.sort(reverse=True, key=lambda tree: tree.prob()) return iter(parses) def _setprob(self, tree, prod_probs): if tree.prob() is not None: return # Get the prob of the CFG production. lhs = Nonterminal(tree.label()) rhs = [] for child in tree: if isinstance(child, Tree): rhs.append(Nonterminal(child.label())) else: rhs.append(child) prob = prod_probs[lhs, tuple(rhs)] # Get the probs of children. for child in tree: if isinstance(child, Tree): self._setprob(child, prod_probs) prob *= child.prob() tree.set_prob(prob) def sort_queue(self, queue, chart): """ Sort the given queue of ``Edge`` objects, placing the edge that should be tried first at the beginning of the queue. 
This method will be called after each ``Edge`` is added to the queue. :param queue: The queue of ``Edge`` objects to sort. Each edge in this queue is an edge that could be added to the chart by the fundamental rule; but that has not yet been added. :type queue: list(Edge) :param chart: The chart being used to parse the text. This chart can be used to provide extra information for sorting the queue. :type chart: Chart :rtype: None """ raise NotImplementedError() def _prune(self, queue, chart): """ Discard items in the queue if the queue is longer than the beam.""" if len(queue) > self.beam_size: split = len(queue)-self.beam_size if self._trace > 2: for edge in queue[:split]: print(' %-50s [DISCARDED]' % chart.pretty_format_edge(edge,2)) del queue[:split] class InsideChartParser(BottomUpProbabilisticChartParser): """ A bottom-up parser for ``PCFG`` grammars that tries edges in descending order of the inside probabilities of their trees. The "inside probability" of a tree is simply the probability of the entire tree, ignoring its context. In particular, the inside probability of a tree generated by production *p* with children *c[1], c[2], ..., c[n]* is *P(p)P(c[1])P(c[2])...P(c[n])*; and the inside probability of a token is 1 if it is present in the text, and 0 if it is absent. This sorting order results in a type of lowest-cost-first search strategy. """ # Inherit constructor. def sort_queue(self, queue, chart): """ Sort the given queue of edges, in descending order of the inside probabilities of the edges' trees. :param queue: The queue of ``Edge`` objects to sort. Each edge in this queue is an edge that could be added to the chart by the fundamental rule; but that has not yet been added. :type queue: list(Edge) :param chart: The chart being used to parse the text. This chart can be used to provide extra information for sorting the queue. :type chart: Chart :rtype: None """ queue.sort(key=lambda edge: edge.prob()) # Eventually, this will become some sort of inside-outside parser: # class InsideOutsideParser(BottomUpProbabilisticChartParser): # def __init__(self, grammar, trace=0): # # Inherit docs. # BottomUpProbabilisticChartParser.__init__(self, grammar, trace) # # # Find the best path from S to each nonterminal # bestp = {} # for production in grammar.productions(): bestp[production.lhs()]=0 # bestp[grammar.start()] = 1.0 # # for i in range(len(grammar.productions())): # for production in grammar.productions(): # lhs = production.lhs() # for elt in production.rhs(): # bestp[elt] = max(bestp[lhs]*production.prob(), # bestp.get(elt,0)) # # self._bestp = bestp # for (k,v) in self._bestp.items(): print k,v # # def _sortkey(self, edge): # return edge.structure()[PROB] * self._bestp[edge.lhs()] # # def sort_queue(self, queue, chart): # queue.sort(key=self._sortkey) import random class RandomChartParser(BottomUpProbabilisticChartParser): """ A bottom-up parser for ``PCFG`` grammars that tries edges in random order. This sorting order results in a random search strategy. """ # Inherit constructor def sort_queue(self, queue, chart): i = random.randint(0, len(queue)-1) (queue[-1], queue[i]) = (queue[i], queue[-1]) class UnsortedChartParser(BottomUpProbabilisticChartParser): """ A bottom-up parser for ``PCFG`` grammars that tries edges in whatever order. """ # Inherit constructor def sort_queue(self, queue, chart): return class LongestChartParser(BottomUpProbabilisticChartParser): """ A bottom-up parser for ``PCFG`` grammars that tries longer edges before shorter ones. 
This sorting order results in a type of best-first search strategy. """ # Inherit constructor def sort_queue(self, queue, chart): queue.sort(key=lambda edge: edge.length()) ##////////////////////////////////////////////////////// ## Test Code ##////////////////////////////////////////////////////// def demo(choice=None, draw_parses=None, print_parses=None): """ A demonstration of the probabilistic parsers. The user is prompted to select which demo to run, and how many parses should be found; and then each parser is run on the same demo, and a summary of the results are displayed. """ import sys, time from nltk import tokenize from nltk.parse import pchart # Define two demos. Each demo has a sentence and a grammar. toy_pcfg1 = PCFG.fromstring(""" S -> NP VP [1.0] NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] Det -> 'the' [0.8] | 'my' [0.2] N -> 'man' [0.5] | 'telescope' [0.5] VP -> VP PP [0.1] | V NP [0.7] | V [0.2] V -> 'ate' [0.35] | 'saw' [0.65] PP -> P NP [1.0] P -> 'with' [0.61] | 'under' [0.39] """) toy_pcfg2 = PCFG.fromstring(""" S -> NP VP [1.0] VP -> V NP [.59] VP -> V [.40] VP -> VP PP [.01] NP -> Det N [.41] NP -> Name [.28] NP -> NP PP [.31] PP -> P NP [1.0] V -> 'saw' [.21] V -> 'ate' [.51] V -> 'ran' [.28] N -> 'boy' [.11] N -> 'cookie' [.12] N -> 'table' [.13] N -> 'telescope' [.14] N -> 'hill' [.5] Name -> 'Jack' [.52] Name -> 'Bob' [.48] P -> 'with' [.61] P -> 'under' [.39] Det -> 'the' [.41] Det -> 'a' [.31] Det -> 'my' [.28] """) demos = [('I saw John with my telescope', toy_pcfg1), ('the boy saw Jack with Bob under the table with a telescope', toy_pcfg2)] if choice is None: # Ask the user which demo they want to use. print() for i in range(len(demos)): print('%3s: %s' % (i+1, demos[i][0])) print(' %r' % demos[i][1]) print() print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ') choice = int(sys.stdin.readline().strip())-1 try: sent, grammar = demos[choice] except: print('Bad sentence number') return # Tokenize the sentence. tokens = sent.split() # Define a list of parsers. We'll use all parsers. parsers = [ pchart.InsideChartParser(grammar), pchart.RandomChartParser(grammar), pchart.UnsortedChartParser(grammar), pchart.LongestChartParser(grammar), pchart.InsideChartParser(grammar, beam_size = len(tokens)+1) # was BeamParser ] # Run the parsers on the tokenized sentence. times = [] average_p = [] num_parses = [] all_parses = {} for parser in parsers: print('\ns: %s\nparser: %s\ngrammar: %s' % (sent,parser,grammar)) parser.trace(3) t = time.time() parses = list(parser.parse(tokens)) times.append(time.time()-t) p = (reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses) if parses else 0) average_p.append(p) num_parses.append(len(parses)) for p in parses: all_parses[p.freeze()] = 1 # Print some summary statistics print() print(' Parser Beam | Time (secs) # Parses Average P(parse)') print('------------------------+------------------------------------------') for i in range(len(parsers)): print('%18s %4d |%11.4f%11d%19.14f' % (parsers[i].__class__.__name__, parsers[i].beam_size, times[i],num_parses[i],average_p[i])) parses = all_parses.keys() if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses) else: p = 0 print('------------------------+------------------------------------------') print('%18s |%11s%11d%19.14f' % ('(All Parses)', 'n/a', len(parses), p)) if draw_parses is None: # Ask the user if we should draw the parses. print() print('Draw parses (y/n)? 
', end=' ') draw_parses = sys.stdin.readline().strip().lower().startswith('y') if draw_parses: from nltk.draw.tree import draw_trees print(' please wait...') draw_trees(*parses) if print_parses is None: # Ask the user if we should print the parses. print() print('Print parses (y/n)? ', end=' ') print_parses = sys.stdin.readline().strip().lower().startswith('y') if print_parses: for parse in parses: print(parse) if __name__ == '__main__': demo() nltk-3.1/nltk/parse/projectivedependencyparser.py0000644000076500000240000006076612607224144022157 0ustar sbstaff00000000000000# Natural Language Toolkit: Dependency Grammars # # Copyright (C) 2001-2015 NLTK Project # Author: Jason Narad # # URL: # For license information, see LICENSE.TXT # from __future__ import print_function, unicode_literals from collections import defaultdict from nltk.grammar import (DependencyProduction, DependencyGrammar, ProbabilisticDependencyGrammar) from nltk.parse.dependencygraph import DependencyGraph from nltk.internals import raise_unorderable_types from nltk.compat import total_ordering, python_2_unicode_compatible ################################################################# # Dependency Span ################################################################# @total_ordering @python_2_unicode_compatible class DependencySpan(object): """ A contiguous span over some part of the input string representing dependency (head -> modifier) relationships amongst words. An atomic span corresponds to only one word so it isn't a 'span' in the conventional sense, as its _start_index = _end_index = _head_index for concatenation purposes. All other spans are assumed to have arcs between all nodes within the start and end indexes of the span, and one head index corresponding to the head word for the entire span. This is the same as the root node if the dependency structure were depicted as a graph. """ def __init__(self, start_index, end_index, head_index, arcs, tags): self._start_index = start_index self._end_index = end_index self._head_index = head_index self._arcs = arcs self._tags = tags self._comparison_key = (start_index, end_index, head_index, tuple(arcs)) self._hash = hash(self._comparison_key) def head_index(self): """ :return: An value indexing the head of the entire ``DependencySpan``. :rtype: int """ return self._head_index def __repr__(self): """ :return: A concise string representatino of the ``DependencySpan``. :rtype: str. """ return 'Span %d-%d; Head Index: %d' % (self._start_index, self._end_index, self._head_index) def __str__(self): """ :return: A verbose string representation of the ``DependencySpan``. :rtype: str """ str = 'Span %d-%d; Head Index: %d' % (self._start_index, self._end_index, self._head_index) for i in range(len(self._arcs)): str += '\n%d <- %d, %s' % (i, self._arcs[i], self._tags[i]) return str def __eq__(self, other): return (type(self) == type(other) and self._comparison_key == other._comparison_key) def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, DependencySpan): raise_unorderable_types("<", self, other) return self._comparison_key < other._comparison_key def __hash__(self): """ :return: The hash value of this ``DependencySpan``. """ return self._hash ################################################################# # Chart Cell ################################################################# @python_2_unicode_compatible class ChartCell(object): """ A cell from the parse chart formed when performing the CYK algorithm. 
Each cell keeps track of its x and y coordinates (though this will probably be discarded), and a list of spans serving as the cell's entries. """ def __init__(self, x, y): """ :param x: This cell's x coordinate. :type x: int. :param y: This cell's y coordinate. :type y: int. """ self._x = x self._y = y self._entries = set([]) def add(self, span): """ Appends the given span to the list of spans representing the chart cell's entries. :param span: The span to add. :type span: DependencySpan """ self._entries.add(span) def __str__(self): """ :return: A verbose string representation of this ``ChartCell``. :rtype: str. """ return 'CC[%d,%d]: %s' % (self._x, self._y, self._entries) def __repr__(self): """ :return: A concise string representation of this ``ChartCell``. :rtype: str. """ return '%s' % self ################################################################# # Parsing with Dependency Grammars ################################################################# class ProjectiveDependencyParser(object): """ A projective, rule-based, dependency parser. A ProjectiveDependencyParser is created with a DependencyGrammar, a set of productions specifying word-to-word dependency relations. The parse() method will then return the set of all parses, in tree representation, for a given input sequence of tokens. Each parse must meet the requirements of the both the grammar and the projectivity constraint which specifies that the branches of the dependency tree are not allowed to cross. Alternatively, this can be understood as stating that each parent node and its children in the parse tree form a continuous substring of the input sequence. """ def __init__(self, dependency_grammar): """ Create a new ProjectiveDependencyParser, from a word-to-word dependency grammar ``DependencyGrammar``. :param dependency_grammar: A word-to-word relation dependencygrammar. :type dependency_grammar: DependencyGrammar """ self._grammar = dependency_grammar def parse(self, tokens): """ Performs a projective dependency parse on the list of tokens using a chart-based, span-concatenation algorithm similar to Eisner (1996). :param tokens: The list of input tokens. :type tokens: list(str) :return: An iterator over parse trees. :rtype: iter(Tree) """ self._tokens = list(tokens) chart = [] for i in range(0, len(self._tokens) + 1): chart.append([]) for j in range(0, len(self._tokens) + 1): chart[i].append(ChartCell(i,j)) if i==j+1: chart[i][j].add(DependencySpan(i-1,i,i-1,[-1], ['null'])) for i in range(1,len(self._tokens)+1): for j in range(i-2,-1,-1): for k in range(i-1,j,-1): for span1 in chart[k][j]._entries: for span2 in chart[i][k]._entries: for newspan in self.concatenate(span1, span2): chart[i][j].add(newspan) for parse in chart[len(self._tokens)][0]._entries: conll_format = "" # malt_format = "" for i in range(len(tokens)): # malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null') #conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'null', '-', '-') # Modify to comply with the new Dependency Graph requirement (at least must have an root elements) conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], 'null', 'null', 'null', parse._arcs[i] + 1, 'ROOT', '-', '-') dg = DependencyGraph(conll_format) # if self.meets_arity(dg): yield dg.tree() def concatenate(self, span1, span2): """ Concatenates the two spans in whichever way possible. 
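        For instance (a minimal sketch with hypothetical spans; ``parser`` is
        assumed to be a ``ProjectiveDependencyParser`` in the middle of a
        ``parse()`` call, since ``concatenate`` consults ``self._tokens`` and
        the grammar)::

            left = DependencySpan(0, 1, 0, [-1], ['null'])
            right = DependencySpan(1, 2, 1, [-1], ['null'])
            new_spans = parser.concatenate(left, right)  # zero, one or two spans over 0-2
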
This includes rightward concatenation (from the leftmost word of the leftmost span to the rightmost word of the rightmost span) and leftward concatenation (vice-versa) between adjacent spans. Unlike Eisner's presentation of span concatenation, these spans do not share or pivot on a particular word/word-index. :return: A list of new spans formed through concatenation. :rtype: list(DependencySpan) """ spans = [] if span1._start_index == span2._start_index: print('Error: Mismatched spans - replace this with thrown error') if span1._start_index > span2._start_index: temp_span = span1 span1 = span2 span2 = temp_span # adjacent rightward covered concatenation new_arcs = span1._arcs + span2._arcs new_tags = span1._tags + span2._tags if self._grammar.contains(self._tokens[span1._head_index], self._tokens[span2._head_index]): # print 'Performing rightward cover %d to %d' % (span1._head_index, span2._head_index) new_arcs[span2._head_index - span1._start_index] = span1._head_index spans.append(DependencySpan(span1._start_index, span2._end_index, span1._head_index, new_arcs, new_tags)) # adjacent leftward covered concatenation new_arcs = span1._arcs + span2._arcs if self._grammar.contains(self._tokens[span2._head_index], self._tokens[span1._head_index]): # print 'performing leftward cover %d to %d' % (span2._head_index, span1._head_index) new_arcs[span1._head_index - span1._start_index] = span2._head_index spans.append(DependencySpan(span1._start_index, span2._end_index, span2._head_index, new_arcs, new_tags)) return spans ################################################################# # Parsing with Probabilistic Dependency Grammars ################################################################# class ProbabilisticProjectiveDependencyParser(object): """A probabilistic, projective dependency parser. This parser returns the most probable projective parse derived from the probabilistic dependency grammar derived from the train() method. The probabilistic model is an implementation of Eisner's (1996) Model C, which conditions on head-word, head-tag, child-word, and child-tag. The decoding uses a bottom-up chart-based span concatenation algorithm that's identical to the one utilized by the rule-based projective parser. Usage example ------------- >>> from nltk.parse.dependencygraph import conll_data2 >>> graphs = [ ... DependencyGraph(entry) for entry in conll_data2.split('\\n\\n') if entry ... ] >>> ppdp = ProbabilisticProjectiveDependencyParser() >>> ppdp.train(graphs) >>> sent = ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.'] >>> list(ppdp.parse(sent)) [Tree('zag', ['Cathy', 'hen', Tree('zwaaien', ['wild', '.'])])] """ def __init__(self): """ Create a new probabilistic dependency parser. No additional operations are necessary. """ def parse(self, tokens): """ Parses the list of tokens subject to the projectivity constraint and the productions in the parser's grammar. This uses a method similar to the span-concatenation algorithm defined in Eisner (1996). It returns the most probable parse derived from the parser's probabilistic dependency grammar. """ self._tokens = list(tokens) chart = [] for i in range(0, len(self._tokens) + 1): chart.append([]) for j in range(0, len(self._tokens) + 1): chart[i].append(ChartCell(i,j)) if i==j+1: if tokens[i-1] in self._grammar._tags: for tag in self._grammar._tags[tokens[i-1]]: chart[i][j].add(DependencySpan(i-1,i,i-1,[-1], [tag])) else: print('No tag found for input token \'%s\', parse is impossible.' 
% tokens[i-1]) return [] for i in range(1,len(self._tokens)+1): for j in range(i-2,-1,-1): for k in range(i-1,j,-1): for span1 in chart[k][j]._entries: for span2 in chart[i][k]._entries: for newspan in self.concatenate(span1, span2): chart[i][j].add(newspan) trees = [] max_parse = None max_score = 0 for parse in chart[len(self._tokens)][0]._entries: conll_format = "" malt_format = "" for i in range(len(tokens)): malt_format += '%s\t%s\t%d\t%s\n' % (tokens[i], 'null', parse._arcs[i] + 1, 'null') #conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'null', '-', '-') # Modify to comply with recent change in dependency graph such that there must be a ROOT element. conll_format += '\t%d\t%s\t%s\t%s\t%s\t%s\t%d\t%s\t%s\t%s\n' % (i+1, tokens[i], tokens[i], parse._tags[i], parse._tags[i], 'null', parse._arcs[i] + 1, 'ROOT', '-', '-') dg = DependencyGraph(conll_format) score = self.compute_prob(dg) trees.append((score, dg.tree())) trees.sort() return (tree for (score, tree) in trees) def concatenate(self, span1, span2): """ Concatenates the two spans in whichever way possible. This includes rightward concatenation (from the leftmost word of the leftmost span to the rightmost word of the rightmost span) and leftward concatenation (vice-versa) between adjacent spans. Unlike Eisner's presentation of span concatenation, these spans do not share or pivot on a particular word/word-index. :return: A list of new spans formed through concatenation. :rtype: list(DependencySpan) """ spans = [] if span1._start_index == span2._start_index: print('Error: Mismatched spans - replace this with thrown error') if span1._start_index > span2._start_index: temp_span = span1 span1 = span2 span2 = temp_span # adjacent rightward covered concatenation new_arcs = span1._arcs + span2._arcs new_tags = span1._tags + span2._tags if self._grammar.contains(self._tokens[span1._head_index], self._tokens[span2._head_index]): new_arcs[span2._head_index - span1._start_index] = span1._head_index spans.append(DependencySpan(span1._start_index, span2._end_index, span1._head_index, new_arcs, new_tags)) # adjacent leftward covered concatenation new_arcs = span1._arcs + span2._arcs new_tags = span1._tags + span2._tags if self._grammar.contains(self._tokens[span2._head_index], self._tokens[span1._head_index]): new_arcs[span1._head_index - span1._start_index] = span2._head_index spans.append(DependencySpan(span1._start_index, span2._end_index, span2._head_index, new_arcs, new_tags)) return spans def train(self, graphs): """ Trains a ProbabilisticDependencyGrammar based on the list of input DependencyGraphs. This model is an implementation of Eisner's (1996) Model C, which derives its statistics from head-word, head-tag, child-word, and child-tag relationships. :param graphs: A list of dependency graphs to train from. 
:type: list(DependencyGraph) """ productions = [] events = defaultdict(int) tags = {} for dg in graphs: for node_index in range(1, len(dg.nodes)): #children = dg.nodes[node_index]['deps'] # Put list so that in will work in python 3 children = sum(list(dg.nodes[node_index]['deps'].values()), []) nr_left_children = dg.left_children(node_index) nr_right_children = dg.right_children(node_index) nr_children = nr_left_children + nr_right_children for child_index in range(0 - (nr_left_children + 1), nr_right_children + 2): head_word = dg.nodes[node_index]['word'] head_tag = dg.nodes[node_index]['tag'] if head_word in tags: tags[head_word].add(head_tag) else: tags[head_word] = set([head_tag]) child = 'STOP' child_tag = 'STOP' prev_word = 'START' prev_tag = 'START' if child_index < 0: array_index = child_index + nr_left_children if array_index >= 0: child = dg.nodes[children[array_index]]['word'] child_tag = dg.nodes[children[array_index]]['tag'] if child_index != -1: prev_word = dg.nodes[children[array_index + 1]]['word'] prev_tag = dg.nodes[children[array_index + 1]]['tag'] if child != 'STOP': productions.append(DependencyProduction(head_word, [child])) head_event = '(head (%s %s) (mods (%s, %s, %s) left))' % (child, child_tag, prev_tag, head_word, head_tag) mod_event = '(mods (%s, %s, %s) left))' % (prev_tag, head_word, head_tag) events[head_event] += 1 events[mod_event] += 1 elif child_index > 0: array_index = child_index + nr_left_children - 1 if array_index < nr_children: child = dg.nodes[children[array_index]]['word'] child_tag = dg.nodes[children[array_index]]['tag'] if child_index != 1: prev_word = dg.nodes[children[array_index - 1]]['word'] prev_tag = dg.nodes[children[array_index - 1]]['tag'] if child != 'STOP': productions.append(DependencyProduction(head_word, [child])) head_event = '(head (%s %s) (mods (%s, %s, %s) right))' % (child, child_tag, prev_tag, head_word, head_tag) mod_event = '(mods (%s, %s, %s) right))' % (prev_tag, head_word, head_tag) events[head_event] += 1 events[mod_event] += 1 self._grammar = ProbabilisticDependencyGrammar(productions, events, tags) def compute_prob(self, dg): """ Computes the probability of a dependency graph based on the parser's probability model (defined by the parser's statistical dependency grammar). :param dg: A dependency graph to score. :type dg: DependencyGraph :return: The probability of the dependency graph. 
:rtype: int """ prob = 1.0 for node_index in range(1, len(dg.nodes)): #children = dg.nodes[node_index]['deps'] children = sum(list(dg.nodes[node_index]['deps'].values()), []) nr_left_children = dg.left_children(node_index) nr_right_children = dg.right_children(node_index) nr_children = nr_left_children + nr_right_children for child_index in range(0 - (nr_left_children + 1), nr_right_children + 2): head_word = dg.nodes[node_index]['word'] head_tag = dg.nodes[node_index]['tag'] child = 'STOP' child_tag = 'STOP' prev_word = 'START' prev_tag = 'START' if child_index < 0: array_index = child_index + nr_left_children if array_index >= 0: child = dg.nodes[children[array_index]]['word'] child_tag = dg.nodes[children[array_index]]['tag'] if child_index != -1: prev_word = dg.nodes[children[array_index + 1]]['word'] prev_tag = dg.nodes[children[array_index + 1]]['tag'] head_event = '(head (%s %s) (mods (%s, %s, %s) left))' % (child, child_tag, prev_tag, head_word, head_tag) mod_event = '(mods (%s, %s, %s) left))' % (prev_tag, head_word, head_tag) h_count = self._grammar._events[head_event] m_count = self._grammar._events[mod_event] # If the grammar is not covered if m_count != 0: prob *= (h_count / m_count) else: prob = 0.00000001 # Very small number elif child_index > 0: array_index = child_index + nr_left_children - 1 if array_index < nr_children: child = dg.nodes[children[array_index]]['word'] child_tag = dg.nodes[children[array_index]]['tag'] if child_index != 1: prev_word = dg.nodes[children[array_index - 1]]['word'] prev_tag = dg.nodes[children[array_index - 1]]['tag'] head_event = '(head (%s %s) (mods (%s, %s, %s) right))' % (child, child_tag, prev_tag, head_word, head_tag) mod_event = '(mods (%s, %s, %s) right))' % (prev_tag, head_word, head_tag) h_count = self._grammar._events[head_event] m_count = self._grammar._events[mod_event] if m_count != 0: prob *= (h_count / m_count) else: prob = 0.00000001 # Very small number return prob ################################################################# # Demos ################################################################# def demo(): projective_rule_parse_demo() # arity_parse_demo() projective_prob_parse_demo() def projective_rule_parse_demo(): """ A demonstration showing the creation and use of a ``DependencyGrammar`` to perform a projective dependency parse. """ grammar = DependencyGrammar.fromstring(""" 'scratch' -> 'cats' | 'walls' 'walls' -> 'the' 'cats' -> 'the' """) print(grammar) pdp = ProjectiveDependencyParser(grammar) trees = pdp.parse(['the', 'cats', 'scratch', 'the', 'walls']) for tree in trees: print(tree) def arity_parse_demo(): """ A demonstration showing the creation of a ``DependencyGrammar`` in which a specific number of modifiers is listed for a given head. This can further constrain the number of possible parses created by a ``ProjectiveDependencyParser``. """ print() print('A grammar with no arity constraints. 
Each DependencyProduction') print('specifies a relationship between one head word and only one') print('modifier word.') grammar = DependencyGrammar.fromstring(""" 'fell' -> 'price' | 'stock' 'price' -> 'of' | 'the' 'of' -> 'stock' 'stock' -> 'the' """) print(grammar) print() print('For the sentence \'The price of the stock fell\', this grammar') print('will produce the following three parses:') pdp = ProjectiveDependencyParser(grammar) trees = pdp.parse(['the', 'price', 'of', 'the', 'stock', 'fell']) for tree in trees: print(tree) print() print('By contrast, the following grammar contains a ') print('DependencyProduction that specifies a relationship') print('between a single head word, \'price\', and two modifier') print('words, \'of\' and \'the\'.') grammar = DependencyGrammar.fromstring(""" 'fell' -> 'price' | 'stock' 'price' -> 'of' 'the' 'of' -> 'stock' 'stock' -> 'the' """) print(grammar) print() print('This constrains the number of possible parses to just one:') # unimplemented, soon to replace pdp = ProjectiveDependencyParser(grammar) trees = pdp.parse(['the', 'price', 'of', 'the', 'stock', 'fell']) for tree in trees: print(tree) def projective_prob_parse_demo(): """ A demo showing the training and use of a projective dependency parser. """ from nltk.parse.dependencygraph import conll_data2 graphs = [DependencyGraph(entry) for entry in conll_data2.split('\n\n') if entry] ppdp = ProbabilisticProjectiveDependencyParser() print('Training Probabilistic Projective Dependency Parser...') ppdp.train(graphs) sent = ['Cathy', 'zag', 'hen', 'wild', 'zwaaien', '.'] print('Parsing \'', " ".join(sent), '\'...') print('Parse:') for tree in ppdp.parse(sent): print(tree) if __name__ == '__main__': demo() nltk-3.1/nltk/parse/recursivedescent.py0000644000076500000240000006127512607224144020102 0ustar sbstaff00000000000000# Natural Language Toolkit: Recursive Descent Parser # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT from __future__ import print_function, unicode_literals from nltk.grammar import Nonterminal from nltk.tree import Tree, ImmutableTree from nltk.compat import unicode_repr from nltk.parse.api import ParserI ##////////////////////////////////////////////////////// ## Recursive Descent Parser ##////////////////////////////////////////////////////// class RecursiveDescentParser(ParserI): """ A simple top-down CFG parser that parses texts by recursively expanding the fringe of a Tree, and matching it against a text. ``RecursiveDescentParser`` uses a list of tree locations called a "frontier" to remember which subtrees have not yet been expanded and which leaves have not yet been matched against the text. Each tree location consists of a list of child indices specifying the path from the root of the tree to a subtree or a leaf; see the reference documentation for Tree for more information about tree locations. When the parser begins parsing a text, it constructs a tree containing only the start symbol, and a frontier containing the location of the tree's root node. It then extends the tree to cover the text, using the following recursive procedure: - If the frontier is empty, and the text is covered by the tree, then return the tree as a possible parse. - If the frontier is empty, and the text is not covered by the tree, then return no parses. - If the first element of the frontier is a subtree, then use CFG productions to "expand" it. 
For each applicable production, add the expanded subtree's children to the frontier, and recursively find all parses that can be generated by the new tree and frontier. - If the first element of the frontier is a token, then "match" it against the next token from the text. Remove the token from the frontier, and recursively find all parses that can be generated by the new tree and frontier. :see: ``nltk.grammar`` """ def __init__(self, grammar, trace=0): """ Create a new ``RecursiveDescentParser``, that uses ``grammar`` to parse texts. :type grammar: CFG :param grammar: The grammar used to parse texts. :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; and higher numbers will produce more verbose tracing output. """ self._grammar = grammar self._trace = trace def grammar(self): return self._grammar def parse(self, tokens): # Inherit docs from ParserI tokens = list(tokens) self._grammar.check_coverage(tokens) # Start a recursive descent parse, with an initial tree # containing just the start symbol. start = self._grammar.start().symbol() initial_tree = Tree(start, []) frontier = [()] if self._trace: self._trace_start(initial_tree, frontier, tokens) return self._parse(tokens, initial_tree, frontier) def _parse(self, remaining_text, tree, frontier): """ Recursively expand and match each elements of ``tree`` specified by ``frontier``, to cover ``remaining_text``. Return a list of all parses found. :return: An iterator of all parses that can be generated by matching and expanding the elements of ``tree`` specified by ``frontier``. :rtype: iter(Tree) :type tree: Tree :param tree: A partial structure for the text that is currently being parsed. The elements of ``tree`` that are specified by ``frontier`` have not yet been expanded or matched. :type remaining_text: list(str) :param remaining_text: The portion of the text that is not yet covered by ``tree``. :type frontier: list(tuple(int)) :param frontier: A list of the locations within ``tree`` of all subtrees that have not yet been expanded, and all leaves that have not yet been matched. This list sorted in left-to-right order of location within the tree. """ # If the tree covers the text, and there's nothing left to # expand, then we've found a complete parse; return it. if len(remaining_text) == 0 and len(frontier) == 0: if self._trace: self._trace_succeed(tree, frontier) yield tree # If there's still text, but nothing left to expand, we failed. elif len(frontier) == 0: if self._trace: self._trace_backtrack(tree, frontier) # If the next element on the frontier is a tree, expand it. elif isinstance(tree[frontier[0]], Tree): for result in self._expand(remaining_text, tree, frontier): yield result # If the next element on the frontier is a token, match it. else: for result in self._match(remaining_text, tree, frontier): yield result def _match(self, rtext, tree, frontier): """ :rtype: iter(Tree) :return: an iterator of all parses that can be generated by matching the first element of ``frontier`` against the first token in ``rtext``. In particular, if the first element of ``frontier`` has the same type as the first token in ``rtext``, then substitute the token into ``tree``; and return all parses that can be generated by matching and expanding the remaining elements of ``frontier``. If the first element of ``frontier`` does not have the same type as the first token in ``rtext``, then return empty list. 
:type tree: Tree :param tree: A partial structure for the text that is currently being parsed. The elements of ``tree`` that are specified by ``frontier`` have not yet been expanded or matched. :type rtext: list(str) :param rtext: The portion of the text that is not yet covered by ``tree``. :type frontier: list of tuple of int :param frontier: A list of the locations within ``tree`` of all subtrees that have not yet been expanded, and all leaves that have not yet been matched. """ tree_leaf = tree[frontier[0]] if (len(rtext) > 0 and tree_leaf == rtext[0]): # If it's a terminal that matches rtext[0], then substitute # in the token, and continue parsing. newtree = tree.copy(deep=True) newtree[frontier[0]] = rtext[0] if self._trace: self._trace_match(newtree, frontier[1:], rtext[0]) for result in self._parse(rtext[1:], newtree, frontier[1:]): yield result else: # If it's a non-matching terminal, fail. if self._trace: self._trace_backtrack(tree, frontier, rtext[:1]) def _expand(self, remaining_text, tree, frontier, production=None): """ :rtype: iter(Tree) :return: An iterator of all parses that can be generated by expanding the first element of ``frontier`` with ``production``. In particular, if the first element of ``frontier`` is a subtree whose node type is equal to ``production``'s left hand side, then add a child to that subtree for each element of ``production``'s right hand side; and return all parses that can be generated by matching and expanding the remaining elements of ``frontier``. If the first element of ``frontier`` is not a subtree whose node type is equal to ``production``'s left hand side, then return an empty list. If ``production`` is not specified, then return a list of all parses that can be generated by expanding the first element of ``frontier`` with *any* CFG production. :type tree: Tree :param tree: A partial structure for the text that is currently being parsed. The elements of ``tree`` that are specified by ``frontier`` have not yet been expanded or matched. :type remaining_text: list(str) :param remaining_text: The portion of the text that is not yet covered by ``tree``. :type frontier: list(tuple(int)) :param frontier: A list of the locations within ``tree`` of all subtrees that have not yet been expanded, and all leaves that have not yet been matched. """ if production is None: productions = self._grammar.productions() else: productions = [production] for production in productions: lhs = production.lhs().symbol() if lhs == tree[frontier[0]].label(): subtree = self._production_to_tree(production) if frontier[0] == (): newtree = subtree else: newtree = tree.copy(deep=True) newtree[frontier[0]] = subtree new_frontier = [frontier[0]+(i,) for i in range(len(production.rhs()))] if self._trace: self._trace_expand(newtree, new_frontier, production) for result in self._parse(remaining_text, newtree, new_frontier + frontier[1:]): yield result def _production_to_tree(self, production): """ :rtype: Tree :return: The Tree that is licensed by ``production``. In particular, given the production ``[lhs -> elt[1] ... elt[n]]`` return a tree that has a node ``lhs.symbol``, and ``n`` children. For each nonterminal element ``elt[i]`` in the production, the tree token has a childless subtree with node value ``elt[i].symbol``; and for each terminal element ``elt[j]``, the tree token has a leaf token with type ``elt[j]``. :param production: The CFG production that licenses the tree token that should be returned. 
:type production: Production """ children = [] for elt in production.rhs(): if isinstance(elt, Nonterminal): children.append(Tree(elt.symbol(), [])) else: # This will be matched. children.append(elt) return Tree(production.lhs().symbol(), children) def trace(self, trace=2): """ Set the level of tracing output that should be generated when parsing a text. :type trace: int :param trace: The trace level. A trace level of ``0`` will generate no tracing output; and higher trace levels will produce more verbose tracing output. :rtype: None """ self._trace = trace def _trace_fringe(self, tree, treeloc=None): """ Print trace output displaying the fringe of ``tree``. The fringe of ``tree`` consists of all of its leaves and all of its childless subtrees. :rtype: None """ if treeloc == (): print("*", end=' ') if isinstance(tree, Tree): if len(tree) == 0: print(unicode_repr(Nonterminal(tree.label())), end=' ') for i in range(len(tree)): if treeloc is not None and i == treeloc[0]: self._trace_fringe(tree[i], treeloc[1:]) else: self._trace_fringe(tree[i]) else: print(unicode_repr(tree), end=' ') def _trace_tree(self, tree, frontier, operation): """ Print trace output displaying the parser's current state. :param operation: A character identifying the operation that generated the current state. :rtype: None """ if self._trace == 2: print(' %c [' % operation, end=' ') else: print(' [', end=' ') if len(frontier) > 0: self._trace_fringe(tree, frontier[0]) else: self._trace_fringe(tree) print(']') def _trace_start(self, tree, frontier, text): print('Parsing %r' % " ".join(text)) if self._trace > 2: print('Start:') if self._trace > 1: self._trace_tree(tree, frontier, ' ') def _trace_expand(self, tree, frontier, production): if self._trace > 2: print('Expand: %s' % production) if self._trace > 1: self._trace_tree(tree, frontier, 'E') def _trace_match(self, tree, frontier, tok): if self._trace > 2: print('Match: %r' % tok) if self._trace > 1: self._trace_tree(tree, frontier, 'M') def _trace_succeed(self, tree, frontier): if self._trace > 2: print('GOOD PARSE:') if self._trace == 1: print('Found a parse:\n%s' % tree) if self._trace > 1: self._trace_tree(tree, frontier, '+') def _trace_backtrack(self, tree, frontier, toks=None): if self._trace > 2: if toks: print('Backtrack: %r match failed' % toks[0]) else: print('Backtrack') ##////////////////////////////////////////////////////// ## Stepping Recursive Descent Parser ##////////////////////////////////////////////////////// class SteppingRecursiveDescentParser(RecursiveDescentParser): """ A ``RecursiveDescentParser`` that allows you to step through the parsing process, performing a single operation at a time. The ``initialize`` method is used to start parsing a text. ``expand`` expands the first element on the frontier using a single CFG production, and ``match`` matches the first element on the frontier against the next text token. ``backtrack`` undoes the most recent expand or match operation. ``step`` performs a single expand, match, or backtrack operation. ``parses`` returns the set of parses that have been found by the parser. :ivar _history: A list of ``(rtext, tree, frontier)`` tripples, containing the previous states of the parser. This history is used to implement the ``backtrack`` operation. :ivar _tried_e: A record of all productions that have been tried for a given tree. This record is used by ``expand`` to perform the next untried production. :ivar _tried_m: A record of what tokens have been matched for a given tree. 
This record is used by ``step`` to decide whether or not to match a token. :see: ``nltk.grammar`` """ def __init__(self, grammar, trace=0): self._grammar = grammar self._trace = trace self._rtext = None self._tree = None self._frontier = [()] self._tried_e = {} self._tried_m = {} self._history = [] self._parses = [] # [XX] TEMPORARY HACK WARNING! This should be replaced with # something nicer when we get the chance. def _freeze(self, tree): c = tree.copy() # for pos in c.treepositions('leaves'): # c[pos] = c[pos].freeze() return ImmutableTree.convert(c) def parse(self, tokens): tokens = list(tokens) self.initialize(tokens) while self.step() is not None: pass return self.parses() def initialize(self, tokens): """ Start parsing a given text. This sets the parser's tree to the start symbol, its frontier to the root node, and its remaining text to ``token['SUBTOKENS']``. """ self._rtext = tokens start = self._grammar.start().symbol() self._tree = Tree(start, []) self._frontier = [()] self._tried_e = {} self._tried_m = {} self._history = [] self._parses = [] if self._trace: self._trace_start(self._tree, self._frontier, self._rtext) def remaining_text(self): """ :return: The portion of the text that is not yet covered by the tree. :rtype: list(str) """ return self._rtext def frontier(self): """ :return: A list of the tree locations of all subtrees that have not yet been expanded, and all leaves that have not yet been matched. :rtype: list(tuple(int)) """ return self._frontier def tree(self): """ :return: A partial structure for the text that is currently being parsed. The elements specified by the frontier have not yet been expanded or matched. :rtype: Tree """ return self._tree def step(self): """ Perform a single parsing operation. If an untried match is possible, then perform the match, and return the matched token. If an untried expansion is possible, then perform the expansion, and return the production that it is based on. If backtracking is possible, then backtrack, and return True. Otherwise, return None. :return: None if no operation was performed; a token if a match was performed; a production if an expansion was performed; and True if a backtrack operation was performed. :rtype: Production or String or bool """ # Try matching (if we haven't already) if self.untried_match(): token = self.match() if token is not None: return token # Try expanding. production = self.expand() if production is not None: return production # Try backtracking if self.backtrack(): self._trace_backtrack(self._tree, self._frontier) return True # Nothing left to do. return None def expand(self, production=None): """ Expand the first element of the frontier. In particular, if the first element of the frontier is a subtree whose node type is equal to ``production``'s left hand side, then add a child to that subtree for each element of ``production``'s right hand side. If ``production`` is not specified, then use the first untried expandable production. If all expandable productions have been tried, do nothing. :return: The production used to expand the frontier, if an expansion was performed. If no expansion was performed, return None. :rtype: Production or None """ # Make sure we *can* expand. if len(self._frontier) == 0: return None if not isinstance(self._tree[self._frontier[0]], Tree): return None # If they didn't specify a production, check all untried ones. 
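        # Each production tried here is recorded in self._tried_e, keyed on a
        # frozen copy of the current tree, so repeated calls to expand() work
        # through the untried expansions one at a time.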
if production is None: productions = self.untried_expandable_productions() else: productions = [production] parses = [] for prod in productions: # Record that we've tried this production now. self._tried_e.setdefault(self._freeze(self._tree), []).append(prod) # Try expanding. for _result in self._expand(self._rtext, self._tree, self._frontier, prod): return prod # We didn't expand anything. return None def match(self): """ Match the first element of the frontier. In particular, if the first element of the frontier has the same type as the next text token, then substitute the text token into the tree. :return: The token matched, if a match operation was performed. If no match was performed, return None :rtype: str or None """ # Record that we've tried matching this token. tok = self._rtext[0] self._tried_m.setdefault(self._freeze(self._tree), []).append(tok) # Make sure we *can* match. if len(self._frontier) == 0: return None if isinstance(self._tree[self._frontier[0]], Tree): return None for _result in self._match(self._rtext, self._tree, self._frontier): # Return the token we just matched. return self._history[-1][0][0] return None def backtrack(self): """ Return the parser to its state before the most recent match or expand operation. Calling ``undo`` repeatedly return the parser to successively earlier states. If no match or expand operations have been performed, ``undo`` will make no changes. :return: true if an operation was successfully undone. :rtype: bool """ if len(self._history) == 0: return False (self._rtext, self._tree, self._frontier) = self._history.pop() return True def expandable_productions(self): """ :return: A list of all the productions for which expansions are available for the current parser state. :rtype: list(Production) """ # Make sure we *can* expand. if len(self._frontier) == 0: return [] frontier_child = self._tree[self._frontier[0]] if (len(self._frontier) == 0 or not isinstance(frontier_child, Tree)): return [] return [p for p in self._grammar.productions() if p.lhs().symbol() == frontier_child.label()] def untried_expandable_productions(self): """ :return: A list of all the untried productions for which expansions are available for the current parser state. :rtype: list(Production) """ tried_expansions = self._tried_e.get(self._freeze(self._tree), []) return [p for p in self.expandable_productions() if p not in tried_expansions] def untried_match(self): """ :return: Whether the first element of the frontier is a token that has not yet been matched. :rtype: bool """ if len(self._rtext) == 0: return False tried_matches = self._tried_m.get(self._freeze(self._tree), []) return (self._rtext[0] not in tried_matches) def currently_complete(self): """ :return: Whether the parser's current state represents a complete parse. :rtype: bool """ return (len(self._frontier) == 0 and len(self._rtext) == 0) def _parse(self, remaining_text, tree, frontier): """ A stub version of ``_parse`` that sets the parsers current state to the given arguments. In ``RecursiveDescentParser``, the ``_parse`` method is used to recursively continue parsing a text. ``SteppingRecursiveDescentParser`` overrides it to capture these recursive calls. It records the parser's old state in the history (to allow for backtracking), and updates the parser's new state using the given arguments. Finally, it returns ``[1]``, which is used by ``match`` and ``expand`` to detect whether their operations were successful. 
:return: ``[1]`` :rtype: list of int """ self._history.append( (self._rtext, self._tree, self._frontier) ) self._rtext = remaining_text self._tree = tree self._frontier = frontier # Is it a good parse? If so, record it. if (len(frontier) == 0 and len(remaining_text) == 0): self._parses.append(tree) self._trace_succeed(self._tree, self._frontier) return [1] def parses(self): """ :return: An iterator of the parses that have been found by this parser so far. :rtype: list of Tree """ return iter(self._parses) def set_grammar(self, grammar): """ Change the grammar used to parse texts. :param grammar: The new grammar. :type grammar: CFG """ self._grammar = grammar ##////////////////////////////////////////////////////// ## Demonstration Code ##////////////////////////////////////////////////////// def demo(): """ A demonstration of the recursive descent parser. """ from nltk import parse, CFG grammar = CFG.fromstring(""" S -> NP VP NP -> Det N | Det N PP VP -> V NP | V NP PP PP -> P NP NP -> 'I' N -> 'man' | 'park' | 'telescope' | 'dog' Det -> 'the' | 'a' P -> 'in' | 'with' V -> 'saw' """) for prod in grammar.productions(): print(prod) sent = 'I saw a man in the park'.split() parser = parse.RecursiveDescentParser(grammar, trace=2) for p in parser.parse(sent): print(p) if __name__ == '__main__': demo() nltk-3.1/nltk/parse/shiftreduce.py0000644000076500000240000004016612607224144017026 0ustar sbstaff00000000000000# Natural Language Toolkit: Shift-Reduce Parser # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT from __future__ import print_function, unicode_literals from nltk.grammar import Nonterminal from nltk.tree import Tree from nltk.compat import unicode_repr from nltk.parse.api import ParserI ##////////////////////////////////////////////////////// ## Shift/Reduce Parser ##////////////////////////////////////////////////////// class ShiftReduceParser(ParserI): """ A simple bottom-up CFG parser that uses two operations, "shift" and "reduce", to find a single parse for a text. ``ShiftReduceParser`` maintains a stack, which records the structure of a portion of the text. This stack is a list of strings and Trees that collectively cover a portion of the text. For example, while parsing the sentence "the dog saw the man" with a typical grammar, ``ShiftReduceParser`` will produce the following stack, which covers "the dog saw":: [(NP: (Det: 'the') (N: 'dog')), (V: 'saw')] ``ShiftReduceParser`` attempts to extend the stack to cover the entire text, and to combine the stack elements into a single tree, producing a complete parse for the sentence. Initially, the stack is empty. It is extended to cover the text, from left to right, by repeatedly applying two operations: - "shift" moves a token from the beginning of the text to the end of the stack. - "reduce" uses a CFG production to combine the rightmost stack elements into a single Tree. Often, more than one operation can be performed on a given stack. In this case, ``ShiftReduceParser`` uses the following heuristics to decide which operation to perform: - Only shift if no reductions are available. - If multiple reductions are available, then apply the reduction whose CFG production is listed earliest in the grammar. Note that these heuristics are not guaranteed to choose an operation that leads to a parse of the text. Also, if multiple parses exists, ``ShiftReduceParser`` will return at most one of them. 
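    A minimal usage sketch, mirroring the ``demo()`` at the end of this
    module (the toy grammar and sentence are illustrative)::

        from nltk import parse, CFG

        grammar = CFG.fromstring('''
            S -> NP VP
            NP -> Det N | 'I'
            VP -> V NP
            Det -> 'a'
            N -> 'man'
            V -> 'saw'
        ''')
        sr = parse.ShiftReduceParser(grammar)
        for tree in sr.parse('I saw a man'.split()):
            print(tree)
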
:see: ``nltk.grammar`` """ def __init__(self, grammar, trace=0): """ Create a new ``ShiftReduceParser``, that uses ``grammar`` to parse texts. :type grammar: Grammar :param grammar: The grammar used to parse texts. :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; and higher numbers will produce more verbose tracing output. """ self._grammar = grammar self._trace = trace self._check_grammar() def grammar(self): return self._grammar def parse(self, tokens): tokens = list(tokens) self._grammar.check_coverage(tokens) # initialize the stack. stack = [] remaining_text = tokens # Trace output. if self._trace: print('Parsing %r' % " ".join(tokens)) self._trace_stack(stack, remaining_text) # iterate through the text, pushing the token onto # the stack, then reducing the stack. while len(remaining_text) > 0: self._shift(stack, remaining_text) while self._reduce(stack, remaining_text): pass # Did we reduce everything? if len(stack) == 1: # Did we end up with the right category? if stack[0].label() == self._grammar.start().symbol(): yield stack[0] def _shift(self, stack, remaining_text): """ Move a token from the beginning of ``remaining_text`` to the end of ``stack``. :type stack: list(str and Tree) :param stack: A list of strings and Trees, encoding the structure of the text that has been parsed so far. :type remaining_text: list(str) :param remaining_text: The portion of the text that is not yet covered by ``stack``. :rtype: None """ stack.append(remaining_text[0]) remaining_text.remove(remaining_text[0]) if self._trace: self._trace_shift(stack, remaining_text) def _match_rhs(self, rhs, rightmost_stack): """ :rtype: bool :return: true if the right hand side of a CFG production matches the rightmost elements of the stack. ``rhs`` matches ``rightmost_stack`` if they are the same length, and each element of ``rhs`` matches the corresponding element of ``rightmost_stack``. A nonterminal element of ``rhs`` matches any Tree whose node value is equal to the nonterminal's symbol. A terminal element of ``rhs`` matches any string whose type is equal to the terminal. :type rhs: list(terminal and Nonterminal) :param rhs: The right hand side of a CFG production. :type rightmost_stack: list(string and Tree) :param rightmost_stack: The rightmost elements of the parser's stack. """ if len(rightmost_stack) != len(rhs): return False for i in range(len(rightmost_stack)): if isinstance(rightmost_stack[i], Tree): if not isinstance(rhs[i], Nonterminal): return False if rightmost_stack[i].label() != rhs[i].symbol(): return False else: if isinstance(rhs[i], Nonterminal): return False if rightmost_stack[i] != rhs[i]: return False return True def _reduce(self, stack, remaining_text, production=None): """ Find a CFG production whose right hand side matches the rightmost stack elements; and combine those stack elements into a single Tree, with the node specified by the production's left-hand side. If more than one CFG production matches the stack, then use the production that is listed earliest in the grammar. The new Tree replaces the elements in the stack. :rtype: Production or None :return: If a reduction is performed, then return the CFG production that the reduction is based on; otherwise, return false. :type stack: list(string and Tree) :param stack: A list of strings and Trees, encoding the structure of the text that has been parsed so far. 
:type remaining_text: list(str) :param remaining_text: The portion of the text that is not yet covered by ``stack``. """ if production is None: productions = self._grammar.productions() else: productions = [production] # Try each production, in order. for production in productions: rhslen = len(production.rhs()) # check if the RHS of a production matches the top of the stack if self._match_rhs(production.rhs(), stack[-rhslen:]): # combine the tree to reflect the reduction tree = Tree(production.lhs().symbol(), stack[-rhslen:]) stack[-rhslen:] = [tree] # We reduced something if self._trace: self._trace_reduce(stack, production, remaining_text) return production # We didn't reduce anything return None def trace(self, trace=2): """ Set the level of tracing output that should be generated when parsing a text. :type trace: int :param trace: The trace level. A trace level of ``0`` will generate no tracing output; and higher trace levels will produce more verbose tracing output. :rtype: None """ # 1: just show shifts. # 2: show shifts & reduces # 3: display which tokens & productions are shifed/reduced self._trace = trace def _trace_stack(self, stack, remaining_text, marker=' '): """ Print trace output displaying the given stack and text. :rtype: None :param marker: A character that is printed to the left of the stack. This is used with trace level 2 to print 'S' before shifted stacks and 'R' before reduced stacks. """ s = ' '+marker+' [ ' for elt in stack: if isinstance(elt, Tree): s += unicode_repr(Nonterminal(elt.label())) + ' ' else: s += unicode_repr(elt) + ' ' s += '* ' + ' '.join(remaining_text) + ']' print(s) def _trace_shift(self, stack, remaining_text): """ Print trace output displaying that a token has been shifted. :rtype: None """ if self._trace > 2: print('Shift %r:' % stack[-1]) if self._trace == 2: self._trace_stack(stack, remaining_text, 'S') elif self._trace > 0: self._trace_stack(stack, remaining_text) def _trace_reduce(self, stack, production, remaining_text): """ Print trace output displaying that ``production`` was used to reduce ``stack``. :rtype: None """ if self._trace > 2: rhs = " ".join(production.rhs()) print('Reduce %r <- %s' % (production.lhs(), rhs)) if self._trace == 2: self._trace_stack(stack, remaining_text, 'R') elif self._trace > 1: self._trace_stack(stack, remaining_text) def _check_grammar(self): """ Check to make sure that all of the CFG productions are potentially useful. If any productions can never be used, then print a warning. :rtype: None """ productions = self._grammar.productions() # Any production whose RHS is an extension of another production's RHS # will never be used. for i in range(len(productions)): for j in range(i+1, len(productions)): rhs1 = productions[i].rhs() rhs2 = productions[j].rhs() if rhs1[:len(rhs2)] == rhs2: print('Warning: %r will never be used' % productions[i]) ##////////////////////////////////////////////////////// ## Stepping Shift/Reduce Parser ##////////////////////////////////////////////////////// class SteppingShiftReduceParser(ShiftReduceParser): """ A ``ShiftReduceParser`` that allows you to setp through the parsing process, performing a single operation at a time. It also allows you to change the parser's grammar midway through parsing a text. The ``initialize`` method is used to start parsing a text. ``shift`` performs a single shift operation, and ``reduce`` performs a single reduce operation. ``step`` will perform a single reduce operation if possible; otherwise, it will perform a single shift operation. 
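    A stepping session might look like the following sketch (the grammar is
    assumed to be a ``CFG`` that covers the tokens; whether a parse is found
    still depends on the shift-reduce heuristics)::

        srp = SteppingShiftReduceParser(grammar)
        srp.initialize('I saw a man'.split())
        while srp.step():          # reduce when possible, otherwise shift
            pass
        for tree in srp.parses():
            print(tree)
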
``parses`` returns the set of parses that have been found by the parser. :ivar _history: A list of ``(stack, remaining_text)`` pairs, containing all of the previous states of the parser. This history is used to implement the ``undo`` operation. :see: ``nltk.grammar`` """ def __init__(self, grammar, trace=0): self._grammar = grammar self._trace = trace self._stack = None self._remaining_text = None self._history = [] def parse(self, tokens): tokens = list(tokens) self.initialize(tokens) while self.step(): pass return self.parses() def stack(self): """ :return: The parser's stack. :rtype: list(str and Tree) """ return self._stack def remaining_text(self): """ :return: The portion of the text that is not yet covered by the stack. :rtype: list(str) """ return self._remaining_text def initialize(self, tokens): """ Start parsing a given text. This sets the parser's stack to ``[]`` and sets its remaining text to ``tokens``. """ self._stack = [] self._remaining_text = tokens self._history = [] def step(self): """ Perform a single parsing operation. If a reduction is possible, then perform that reduction, and return the production that it is based on. Otherwise, if a shift is possible, then perform it, and return True. Otherwise, return False. :return: False if no operation was performed; True if a shift was performed; and the CFG production used to reduce if a reduction was performed. :rtype: Production or bool """ return self.reduce() or self.shift() def shift(self): """ Move a token from the beginning of the remaining text to the end of the stack. If there are no more tokens in the remaining text, then do nothing. :return: True if the shift operation was successful. :rtype: bool """ if len(self._remaining_text) == 0: return False self._history.append( (self._stack[:], self._remaining_text[:]) ) self._shift(self._stack, self._remaining_text) return True def reduce(self, production=None): """ Use ``production`` to combine the rightmost stack elements into a single Tree. If ``production`` does not match the rightmost stack elements, then do nothing. :return: The production used to reduce the stack, if a reduction was performed. If no reduction was performed, return None. :rtype: Production or None """ self._history.append( (self._stack[:], self._remaining_text[:]) ) return_val = self._reduce(self._stack, self._remaining_text, production) if not return_val: self._history.pop() return return_val def undo(self): """ Return the parser to its state before the most recent shift or reduce operation. Calling ``undo`` repeatedly return the parser to successively earlier states. If no shift or reduce operations have been performed, ``undo`` will make no changes. :return: true if an operation was successfully undone. :rtype: bool """ if len(self._history) == 0: return False (self._stack, self._remaining_text) = self._history.pop() return True def reducible_productions(self): """ :return: A list of the productions for which reductions are available for the current parser state. :rtype: list(Production) """ productions = [] for production in self._grammar.productions(): rhslen = len(production.rhs()) if self._match_rhs(production.rhs(), self._stack[-rhslen:]): productions.append(production) return productions def parses(self): """ :return: An iterator of the parses that have been found by this parser so far. 
:rtype: iter(Tree) """ if (len(self._remaining_text) == 0 and len(self._stack) == 1 and self._stack[0].label() == self._grammar.start().symbol() ): yield self._stack[0] # copied from nltk.parser def set_grammar(self, grammar): """ Change the grammar used to parse texts. :param grammar: The new grammar. :type grammar: CFG """ self._grammar = grammar ##////////////////////////////////////////////////////// ## Demonstration Code ##////////////////////////////////////////////////////// def demo(): """ A demonstration of the shift-reduce parser. """ from nltk import parse, CFG grammar = CFG.fromstring(""" S -> NP VP NP -> Det N | Det N PP VP -> V NP | V NP PP PP -> P NP NP -> 'I' N -> 'man' | 'park' | 'telescope' | 'dog' Det -> 'the' | 'a' P -> 'in' | 'with' V -> 'saw' """) sent = 'I saw a man in the park'.split() parser = parse.ShiftReduceParser(grammar, trace=2) for p in parser.parse(sent): print(p) if __name__ == '__main__': demo() nltk-3.1/nltk/parse/stanford.py0000644000076500000240000004172212610000304016320 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Interface to the Stanford Parser # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Xu # # URL: # For license information, see LICENSE.TXT from __future__ import unicode_literals import tempfile import os import re import warnings from subprocess import PIPE from io import StringIO from nltk import compat from nltk.internals import find_jar, find_jar_iter, config_java, java, _java_options from nltk.parse.api import ParserI from nltk.parse.dependencygraph import DependencyGraph from nltk.tree import Tree _stanford_url = 'http://nlp.stanford.edu/software/lex-parser.shtml' class GenericStanfordParser(ParserI): """Interface to the Stanford Parser""" _MODEL_JAR_PATTERN = r'stanford-parser-(\d+)(\.(\d+))+-models\.jar' _JAR = r'stanford-parser\.jar' _MAIN_CLASS = 'edu.stanford.nlp.parser.lexparser.LexicalizedParser' _USE_STDIN = False _DOUBLE_SPACED_OUTPUT = False def __init__(self, path_to_jar=None, path_to_models_jar=None, model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz', encoding='utf8', verbose=False, java_options='-mx1000m', corenlp_options=''): # find the most recent code and model jar stanford_jar = max( find_jar_iter( self._JAR, path_to_jar, env_vars=('STANFORD_PARSER', 'STANFORD_CORENLP'), searchpath=(), url=_stanford_url, verbose=verbose, is_regex=True ), key=lambda model_name: re.match(self._JAR, model_name) ) model_jar=max( find_jar_iter( self._MODEL_JAR_PATTERN, path_to_models_jar, env_vars=('STANFORD_MODELS', 'STANFORD_CORENLP'), searchpath=(), url=_stanford_url, verbose=verbose, is_regex=True ), key=lambda model_name: re.match(self._MODEL_JAR_PATTERN, model_name) ) self._classpath = (stanford_jar, model_jar) self.model_path = model_path self._encoding = encoding self.corenlp_options = corenlp_options self.java_options = java_options def _parse_trees_output(self, output_): res = [] cur_lines = [] cur_trees = [] blank = False for line in output_.splitlines(False): if line == '': if blank: res.append(iter(cur_trees)) cur_trees = [] blank = False elif self._DOUBLE_SPACED_OUTPUT: cur_trees.append(self._make_tree('\n'.join(cur_lines))) cur_lines = [] blank = True else: res.append(iter([self._make_tree('\n'.join(cur_lines))])) cur_lines = [] else: cur_lines.append(line) blank = False return iter(res) def parse_sents(self, sentences, verbose=False): """ Use StanfordParser to parse multiple sentences. Takes multiple sentences as a list where each sentence is a list of words. 
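        For example (a sketch; it assumes the Stanford Parser jars are
        installed and discoverable, e.g. via the STANFORD_PARSER and
        STANFORD_MODELS environment variables)::

            parser = StanfordParser()
            sentences = [['the', 'dog', 'barked'], ['I', 'saw', 'a', 'dog']]
            for parse_iter in parser.parse_sents(sentences):
                for tree in parse_iter:  # each sentence yields its own iterator of trees
                    print(tree)
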
Each sentence will be automatically tagged with this StanfordParser instance's tagger. If whitespaces exists inside a token, then the token will be treated as separate tokens. :param sentences: Input sentences to parse :type sentences: list(list(str)) :rtype: iter(iter(Tree)) """ cmd = [ self._MAIN_CLASS, '-model', self.model_path, '-sentences', 'newline', '-outputFormat', self._OUTPUT_FORMAT, '-tokenized', '-escaper', 'edu.stanford.nlp.process.PTBEscapingProcessor', ] return self._parse_trees_output(self._execute( cmd, '\n'.join(' '.join(sentence) for sentence in sentences), verbose)) def raw_parse(self, sentence, verbose=False): """ Use StanfordParser to parse a sentence. Takes a sentence as a string; before parsing, it will be automatically tokenized and tagged by the Stanford Parser. :param sentence: Input sentence to parse :type sentence: str :rtype: iter(Tree) """ return next(self.raw_parse_sents([sentence], verbose)) def raw_parse_sents(self, sentences, verbose=False): """ Use StanfordParser to parse multiple sentences. Takes multiple sentences as a list of strings. Each sentence will be automatically tokenized and tagged by the Stanford Parser. :param sentences: Input sentences to parse :type sentences: list(str) :rtype: iter(iter(Tree)) """ cmd = [ self._MAIN_CLASS, '-model', self.model_path, '-sentences', 'newline', '-outputFormat', self._OUTPUT_FORMAT, ] return self._parse_trees_output(self._execute(cmd, '\n'.join(sentences), verbose)) def tagged_parse(self, sentence, verbose=False): """ Use StanfordParser to parse a sentence. Takes a sentence as a list of (word, tag) tuples; the sentence must have already been tokenized and tagged. :param sentence: Input sentence to parse :type sentence: list(tuple(str, str)) :rtype: iter(Tree) """ return next(self.tagged_parse_sents([sentence], verbose)) def tagged_parse_sents(self, sentences, verbose=False): """ Use StanfordParser to parse multiple sentences. Takes multiple sentences where each sentence is a list of (word, tag) tuples. The sentences must have already been tokenized and tagged. :param sentences: Input sentences to parse :type sentences: list(list(tuple(str, str))) :rtype: iter(iter(Tree)) """ tag_separator = '/' cmd = [ self._MAIN_CLASS, '-model', self.model_path, '-sentences', 'newline', '-outputFormat', self._OUTPUT_FORMAT, '-tokenized', '-tagSeparator', tag_separator, '-tokenizerFactory', 'edu.stanford.nlp.process.WhitespaceTokenizer', '-tokenizerMethod', 'newCoreLabelTokenizerFactory', ] # We don't need to escape slashes as "splitting is done on the last instance of the character in the token" return self._parse_trees_output(self._execute( cmd, '\n'.join(' '.join(tag_separator.join(tagged) for tagged in sentence) for sentence in sentences), verbose)) def _execute(self, cmd, input_, verbose=False): encoding = self._encoding cmd.extend(['-encoding', encoding]) if self.corenlp_options: cmd.append(self.corenlp_options) default_options = ' '.join(_java_options) # Configure java. config_java(options=self.java_options, verbose=verbose) # Windows is incompatible with NamedTemporaryFile() without passing in delete=False. with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file: # Write the actual sentences to the temporary input file if isinstance(input_, compat.text_type) and encoding: input_ = input_.encode(encoding) input_file.write(input_) input_file.flush() # Run the tagger and get the output. 
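            # Depending on _USE_STDIN, the input is either piped to the Java
            # process on stdin or passed as the temporary file's name on the
            # command line; either way the result is read back from stdout and
            # decoded below.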
if self._USE_STDIN: input_file.seek(0) stdout, stderr = java(cmd, classpath=self._classpath, stdin=input_file, stdout=PIPE, stderr=PIPE) else: cmd.append(input_file.name) stdout, stderr = java(cmd, classpath=self._classpath, stdout=PIPE, stderr=PIPE) stdout = stdout.decode(encoding) os.unlink(input_file.name) # Return java configurations to their default values. config_java(options=default_options, verbose=False) return stdout class StanfordParser(GenericStanfordParser): """ >>> parser=StanfordParser( ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" ... ) >>> list(parser.raw_parse("the quick brown fox jumps over the lazy dog")) # doctest: +NORMALIZE_WHITESPACE [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])])] >>> sum([list(dep_graphs) for dep_graphs in parser.raw_parse_sents(( ... "the quick brown fox jumps over the lazy dog", ... "the quick grey wolf jumps over the lazy fox" ... ))], []) # doctest: +NORMALIZE_WHITESPACE [Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), Tree('NN', ['fox'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])])])]), Tree('ROOT', [Tree('NP', [Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['quick']), Tree('JJ', ['grey']), Tree('NN', ['wolf'])]), Tree('NP', [Tree('NP', [Tree('NNS', ['jumps'])]), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['fox'])])])])])])] >>> sum([list(dep_graphs) for dep_graphs in parser.parse_sents(( ... "I 'm a dog".split(), ... "This is my friends ' cat ( the tabby )".split(), ... ))], []) # doctest: +NORMALIZE_WHITESPACE [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('PRP', ['I'])]), Tree('VP', [Tree('VBP', ["'m"]), Tree('NP', [Tree('DT', ['a']), Tree('NN', ['dog'])])])])]), Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['This'])]), Tree('VP', [Tree('VBZ', ['is']), Tree('NP', [Tree('NP', [Tree('NP', [Tree('PRP$', ['my']), Tree('NNS', ['friends']), Tree('POS', ["'"])]), Tree('NN', ['cat'])]), Tree('PRN', [Tree('-LRB-', ['-LRB-']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['tabby'])]), Tree('-RRB-', ['-RRB-'])])])])])])] >>> sum([list(dep_graphs) for dep_graphs in parser.tagged_parse_sents(( ... ( ... ("The", "DT"), ... ("quick", "JJ"), ... ("brown", "JJ"), ... ("fox", "NN"), ... ("jumped", "VBD"), ... ("over", "IN"), ... ("the", "DT"), ... ("lazy", "JJ"), ... ("dog", "NN"), ... (".", "."), ... ), ... ))],[]) # doctest: +NORMALIZE_WHITESPACE [Tree('ROOT', [Tree('S', [Tree('NP', [Tree('DT', ['The']), Tree('JJ', ['quick']), Tree('JJ', ['brown']), Tree('NN', ['fox'])]), Tree('VP', [Tree('VBD', ['jumped']), Tree('PP', [Tree('IN', ['over']), Tree('NP', [Tree('DT', ['the']), Tree('JJ', ['lazy']), Tree('NN', ['dog'])])])]), Tree('.', ['.'])])])] """ _OUTPUT_FORMAT = 'penn' def _make_tree(self, result): return Tree.fromstring(result) class StanfordDependencyParser(GenericStanfordParser): """ >>> dep_parser=StanfordDependencyParser( ... model_path="edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz" ... 
) >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])] >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]] >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents(( ... "The quick brown fox jumps over the lazy dog.", ... "The quick grey wolf jumps over the lazy fox." ... ))], []) # doctest: +NORMALIZE_WHITESPACE [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])] >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents(( ... "I 'm a dog".split(), ... "This is my friends ' cat ( the tabby )".split(), ... ))], []) # doctest: +NORMALIZE_WHITESPACE [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])] >>> sum([[list(parse.triples()) for parse in dep_graphs] for dep_graphs in dep_parser.tagged_parse_sents(( ... ( ... ("The", "DT"), ... ("quick", "JJ"), ... ("brown", "JJ"), ... ("fox", "NN"), ... ("jumped", "VBD"), ... ("over", "IN"), ... ("the", "DT"), ... ("lazy", "JJ"), ... ("dog", "NN"), ... (".", "."), ... ), ... ))],[]) # doctest: +NORMALIZE_WHITESPACE [[((u'jumped', u'VBD'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), ((u'jumped', u'VBD'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]] """ _OUTPUT_FORMAT = 'conll2007' def _make_tree(self, result): return DependencyGraph(result, top_relation_label='root') class StanfordNeuralDependencyParser(GenericStanfordParser): ''' >>> from nltk.parse.stanford import StanfordNeuralDependencyParser >>> dep_parser=StanfordNeuralDependencyParser() >>> [parse.tree() for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])])] >>> [list(parse.triples()) for parse in dep_parser.raw_parse("The quick brown fox jumps over the lazy dog.")] # doctest: +NORMALIZE_WHITESPACE [[((u'jumps', u'VBZ'), u'nsubj', (u'fox', u'NN')), ((u'fox', u'NN'), u'det', (u'The', u'DT')), ((u'fox', u'NN'), u'amod', (u'quick', u'JJ')), ((u'fox', u'NN'), u'amod', (u'brown', u'JJ')), ((u'jumps', u'VBZ'), u'nmod', (u'dog', u'NN')), ((u'dog', u'NN'), u'case', (u'over', u'IN')), ((u'dog', u'NN'), u'det', (u'the', u'DT')), ((u'dog', u'NN'), u'amod', (u'lazy', u'JJ'))]] >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.raw_parse_sents(( ... "The quick brown fox jumps over the lazy dog.", ... "The quick grey wolf jumps over the lazy fox." ... 
))], []) # doctest: +NORMALIZE_WHITESPACE [Tree('jumps', [Tree('fox', ['The', 'quick', 'brown']), Tree('dog', ['over', 'the', 'lazy'])]), Tree('jumps', [Tree('wolf', ['The', 'quick', 'grey']), Tree('fox', ['over', 'the', 'lazy'])])] >>> sum([[parse.tree() for parse in dep_graphs] for dep_graphs in dep_parser.parse_sents(( ... "I 'm a dog".split(), ... "This is my friends ' cat ( the tabby )".split(), ... ))], []) # doctest: +NORMALIZE_WHITESPACE [Tree('dog', ['I', "'m", 'a']), Tree('cat', ['This', 'is', Tree('friends', ['my', "'"]), Tree('tabby', ['the'])])] ''' _OUTPUT_FORMAT = 'conll' _MAIN_CLASS = 'edu.stanford.nlp.pipeline.StanfordCoreNLP' _JAR = r'stanford-corenlp-(\d+)(\.(\d+))+\.jar' _MODEL_JAR_PATTERN = r'stanford-corenlp-(\d+)(\.(\d+))+-models\.jar' _USE_STDIN = True _DOUBLE_SPACED_OUTPUT = True def __init__(self, *args, **kwargs): super(StanfordNeuralDependencyParser, self).__init__(*args, **kwargs) self.corenlp_options += '-annotators tokenize,ssplit,pos,depparse' def tagged_parse_sents(self, sentences, verbose=False): ''' Currently unimplemented because the neural dependency parser (and the StanfordCoreNLP pipeline class) doesn't support passing in pre- tagged tokens. ''' raise NotImplementedError( 'tagged_parse[_sents] is not supported by ' 'StanfordNeuralDependencyParser; use ' 'parse[_sents] or raw_parse[_sents] instead.' ) def _make_tree(self, result): return DependencyGraph(result, top_relation_label='ROOT') def setup_module(module): from nose import SkipTest try: StanfordParser( model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz' ) StanfordNeuralDependencyParser() except LookupError: raise SkipTest('doctests from nltk.parse.stanford are skipped because one of the stanford parser or CoreNLP jars doesn\'t exist') nltk-3.1/nltk/parse/transitionparser.py0000644000076500000240000007477312607224144020143 0ustar sbstaff00000000000000# Natural Language Toolkit: Arc-Standard and Arc-eager Transition Based Parsers # # Author: Long Duong # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT from __future__ import absolute_import from __future__ import division from __future__ import print_function import tempfile import pickle from os import remove from copy import deepcopy from operator import itemgetter try: from numpy import array from scipy import sparse from sklearn.datasets import load_svmlight_file from sklearn import svm except ImportError: pass from nltk.parse import ParserI, DependencyGraph, DependencyEvaluator class Configuration(object): """ Class for holding configuration which is the partial analysis of the input sentence. The transition based parser aims at finding set of operators that transfer the initial configuration to the terminal configuration. The configuration includes: - Stack: for storing partially proceeded words - Buffer: for storing remaining input words - Set of arcs: for storing partially built dependency tree This class also provides a method to represent a configuration as list of features. """ def __init__(self, dep_graph): """ :param dep_graph: the representation of an input in the form of dependency graph. :type dep_graph: DependencyGraph where the dependencies are not specified. 
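As a rough illustration (values are schematic, not a doctest), a freshly constructed configuration for an n-word sentence looks like::

    conf = Configuration(dep_graph)
    conf.stack    # [0]            -- the artificial root node
    conf.buffer   # [1, 2, ..., n] -- addresses of the remaining words
    conf.arcs     # []             -- no dependency arcs built yet
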
""" # dep_graph.nodes contain list of token for a sentence self.stack = [0] # The root element self.buffer = list(range(1, len(dep_graph.nodes))) # The rest is in the buffer self.arcs = [] # empty set of arc self._tokens = dep_graph.nodes self._max_address = len(self.buffer) def __str__(self): return 'Stack : ' + \ str(self.stack) + ' Buffer : ' + str(self.buffer) + ' Arcs : ' + str(self.arcs) def _check_informative(self, feat, flag=False): """ Check whether a feature is informative The flag control whether "_" is informative or not """ if feat is None: return False if feat == '': return False if flag is False: if feat == '_': return False return True def extract_features(self): """ Extract the set of features for the current configuration. Implement standard features as describe in Table 3.2 (page 31) in Dependency Parsing book by Sandra Kubler, Ryan McDonal, Joakim Nivre. Please note that these features are very basic. :return: list(str) """ result = [] # Todo : can come up with more complicated features set for better # performance. if len(self.stack) > 0: # Stack 0 stack_idx0 = self.stack[len(self.stack) - 1] token = self._tokens[stack_idx0] if self._check_informative(token['word'], True): result.append('STK_0_FORM_' + token['word']) if 'lemma' in token and self._check_informative(token['lemma']): result.append('STK_0_LEMMA_' + token['lemma']) if self._check_informative(token['tag']): result.append('STK_0_POS_' + token['tag']) if 'feats' in token and self._check_informative(token['feats']): feats = token['feats'].split("|") for feat in feats: result.append('STK_0_FEATS_' + feat) # Stack 1 if len(self.stack) > 1: stack_idx1 = self.stack[len(self.stack) - 2] token = self._tokens[stack_idx1] if self._check_informative(token['tag']): result.append('STK_1_POS_' + token['tag']) # Left most, right most dependency of stack[0] left_most = 1000000 right_most = -1 dep_left_most = '' dep_right_most = '' for (wi, r, wj) in self.arcs: if wi == stack_idx0: if (wj > wi) and (wj > right_most): right_most = wj dep_right_most = r if (wj < wi) and (wj < left_most): left_most = wj dep_left_most = r if self._check_informative(dep_left_most): result.append('STK_0_LDEP_' + dep_left_most) if self._check_informative(dep_right_most): result.append('STK_0_RDEP_' + dep_right_most) # Check Buffered 0 if len(self.buffer) > 0: # Buffer 0 buffer_idx0 = self.buffer[0] token = self._tokens[buffer_idx0] if self._check_informative(token['word'], True): result.append('BUF_0_FORM_' + token['word']) if 'lemma' in token and self._check_informative(token['lemma']): result.append('BUF_0_LEMMA_' + token['lemma']) if self._check_informative(token['tag']): result.append('BUF_0_POS_' + token['tag']) if 'feats' in token and self._check_informative(token['feats']): feats = token['feats'].split("|") for feat in feats: result.append('BUF_0_FEATS_' + feat) # Buffer 1 if len(self.buffer) > 1: buffer_idx1 = self.buffer[1] token = self._tokens[buffer_idx1] if self._check_informative(token['word'], True): result.append('BUF_1_FORM_' + token['word']) if self._check_informative(token['tag']): result.append('BUF_1_POS_' + token['tag']) if len(self.buffer) > 2: buffer_idx2 = self.buffer[2] token = self._tokens[buffer_idx2] if self._check_informative(token['tag']): result.append('BUF_2_POS_' + token['tag']) if len(self.buffer) > 3: buffer_idx3 = self.buffer[3] token = self._tokens[buffer_idx3] if self._check_informative(token['tag']): result.append('BUF_3_POS_' + token['tag']) # Left most, right most dependency of stack[0] left_most = 1000000 
right_most = -1 dep_left_most = '' dep_right_most = '' for (wi, r, wj) in self.arcs: if wi == buffer_idx0: if (wj > wi) and (wj > right_most): right_most = wj dep_right_most = r if (wj < wi) and (wj < left_most): left_most = wj dep_left_most = r if self._check_informative(dep_left_most): result.append('BUF_0_LDEP_' + dep_left_most) if self._check_informative(dep_right_most): result.append('BUF_0_RDEP_' + dep_right_most) return result class Transition(object): """ This class defines a set of transition which is applied to a configuration to get another configuration Note that for different parsing algorithm, the transition is different. """ # Define set of transitions LEFT_ARC = 'LEFTARC' RIGHT_ARC = 'RIGHTARC' SHIFT = 'SHIFT' REDUCE = 'REDUCE' def __init__(self, alg_option): """ :param alg_option: the algorithm option of this parser. Currently support `arc-standard` and `arc-eager` algorithm :type alg_option: str """ self._algo = alg_option if alg_option not in [ TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER]: raise ValueError(" Currently we only support %s and %s " % (TransitionParser.ARC_STANDARD, TransitionParser.ARC_EAGER)) def left_arc(self, conf, relation): """ Note that the algorithm for left-arc is quite similar except for precondition for both arc-standard and arc-eager :param configuration: is the current configuration :return : A new configuration or -1 if the pre-condition is not satisfied """ if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0): return -1 if conf.buffer[0] == 0: # here is the Root element return -1 idx_wi = conf.stack[len(conf.stack) - 1] flag = True if self._algo == TransitionParser.ARC_EAGER: for (idx_parent, r, idx_child) in conf.arcs: if idx_child == idx_wi: flag = False if flag: conf.stack.pop() idx_wj = conf.buffer[0] conf.arcs.append((idx_wj, relation, idx_wi)) else: return -1 def right_arc(self, conf, relation): """ Note that the algorithm for right-arc is DIFFERENT for arc-standard and arc-eager :param configuration: is the current configuration :return : A new configuration or -1 if the pre-condition is not satisfied """ if (len(conf.buffer) <= 0) or (len(conf.stack) <= 0): return -1 if self._algo == TransitionParser.ARC_STANDARD: idx_wi = conf.stack.pop() idx_wj = conf.buffer[0] conf.buffer[0] = idx_wi conf.arcs.append((idx_wi, relation, idx_wj)) else: # arc-eager idx_wi = conf.stack[len(conf.stack) - 1] idx_wj = conf.buffer.pop(0) conf.stack.append(idx_wj) conf.arcs.append((idx_wi, relation, idx_wj)) def reduce(self, conf): """ Note that the algorithm for reduce is only available for arc-eager :param configuration: is the current configuration :return : A new configuration or -1 if the pre-condition is not satisfied """ if self._algo != TransitionParser.ARC_EAGER: return -1 if len(conf.stack) <= 0: return -1 idx_wi = conf.stack[len(conf.stack) - 1] flag = False for (idx_parent, r, idx_child) in conf.arcs: if idx_child == idx_wi: flag = True if flag: conf.stack.pop() # reduce it else: return -1 def shift(self, conf): """ Note that the algorithm for shift is the SAME for arc-standard and arc-eager :param configuration: is the current configuration :return : A new configuration or -1 if the pre-condition is not satisfied """ if len(conf.buffer) <= 0: return -1 idx_wi = conf.buffer.pop(0) conf.stack.append(idx_wi) class TransitionParser(ParserI): """ Class for transition based parser. 
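A typical end-to-end workflow looks like the following (an illustrative sketch, not a doctest; it assumes scikit-learn is installed and that ``gold_graphs`` and ``test_graphs`` are caller-supplied lists of ``DependencyGraph`` objects, the former carrying gold 'head' information)::

    parser = TransitionParser('arc-eager')
    parser.train(gold_graphs, 'dep.arceager.model')    # pickles an SVM model
    parsed = parser.parse(test_graphs, 'dep.arceager.model')
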
Implement 2 algorithms which are "arc-standard" and "arc-eager" """ ARC_STANDARD = 'arc-standard' ARC_EAGER = 'arc-eager' def __init__(self, algorithm): """ :param algorithm: the algorithm option of this parser. Currently support `arc-standard` and `arc-eager` algorithm :type algorithm: str """ if not(algorithm in [self.ARC_STANDARD, self.ARC_EAGER]): raise ValueError(" Currently we only support %s and %s " % (self.ARC_STANDARD, self.ARC_EAGER)) self._algorithm = algorithm self._dictionary = {} self._transition = {} self._match_transition = {} def _get_dep_relation(self, idx_parent, idx_child, depgraph): p_node = depgraph.nodes[idx_parent] c_node = depgraph.nodes[idx_child] if c_node['word'] is None: return None # Root word if c_node['head'] == p_node['address']: return c_node['rel'] else: return None def _convert_to_binary_features(self, features): """ :param features: list of feature string which is needed to convert to binary features :type features: list(str) :return : string of binary features in libsvm format which is 'featureID:value' pairs """ unsorted_result = [] for feature in features: self._dictionary.setdefault(feature, len(self._dictionary)) unsorted_result.append(self._dictionary[feature]) # Default value of each feature is 1.0 return ' '.join(str(featureID) + ':1.0' for featureID in sorted(unsorted_result)) def _is_projective(self, depgraph): arc_list = [] for key in depgraph.nodes: node = depgraph.nodes[key] if 'head' in node: childIdx = node['address'] parentIdx = node['head'] if parentIdx is not None: arc_list.append((parentIdx, childIdx)) for (parentIdx, childIdx) in arc_list: # Ensure that childIdx < parentIdx if childIdx > parentIdx: temp = childIdx childIdx = parentIdx parentIdx = temp for k in range(childIdx + 1, parentIdx): for m in range(len(depgraph.nodes)): if (m < childIdx) or (m > parentIdx): if (k, m) in arc_list: return False if (m, k) in arc_list: return False return True def _write_to_file(self, key, binary_features, input_file): """ write the binary features to input file and update the transition dictionary """ self._transition.setdefault(key, len(self._transition) + 1) self._match_transition[self._transition[key]] = key input_str = str(self._transition[key]) + ' ' + binary_features + '\n' input_file.write(input_str.encode('utf-8')) def _create_training_examples_arc_std(self, depgraphs, input_file): """ Create the training example in the libsvm format and write it to the input_file. Reference : Page 32, Chapter 3. 
Dependency Parsing by Sandra Kubler, Ryan McDonal and Joakim Nivre (2009) """ operation = Transition(self.ARC_STANDARD) count_proj = 0 training_seq = [] for depgraph in depgraphs: if not self._is_projective(depgraph): continue count_proj += 1 conf = Configuration(depgraph) while len(conf.buffer) > 0: b0 = conf.buffer[0] features = conf.extract_features() binary_features = self._convert_to_binary_features(features) if len(conf.stack) > 0: s0 = conf.stack[len(conf.stack) - 1] # Left-arc operation rel = self._get_dep_relation(b0, s0, depgraph) if rel is not None: key = Transition.LEFT_ARC + ':' + rel self._write_to_file(key, binary_features, input_file) operation.left_arc(conf, rel) training_seq.append(key) continue # Right-arc operation rel = self._get_dep_relation(s0, b0, depgraph) if rel is not None: precondition = True # Get the max-index of buffer maxID = conf._max_address for w in range(maxID + 1): if w != b0: relw = self._get_dep_relation(b0, w, depgraph) if relw is not None: if (b0, relw, w) not in conf.arcs: precondition = False if precondition: key = Transition.RIGHT_ARC + ':' + rel self._write_to_file( key, binary_features, input_file) operation.right_arc(conf, rel) training_seq.append(key) continue # Shift operation as the default key = Transition.SHIFT self._write_to_file(key, binary_features, input_file) operation.shift(conf) training_seq.append(key) print(" Number of training examples : " + str(len(depgraphs))) print(" Number of valid (projective) examples : " + str(count_proj)) return training_seq def _create_training_examples_arc_eager(self, depgraphs, input_file): """ Create the training example in the libsvm format and write it to the input_file. Reference : 'A Dynamic Oracle for Arc-Eager Dependency Parsing' by Joav Goldberg and Joakim Nivre """ operation = Transition(self.ARC_EAGER) countProj = 0 training_seq = [] for depgraph in depgraphs: if not self._is_projective(depgraph): continue countProj += 1 conf = Configuration(depgraph) while len(conf.buffer) > 0: b0 = conf.buffer[0] features = conf.extract_features() binary_features = self._convert_to_binary_features(features) if len(conf.stack) > 0: s0 = conf.stack[len(conf.stack) - 1] # Left-arc operation rel = self._get_dep_relation(b0, s0, depgraph) if rel is not None: key = Transition.LEFT_ARC + ':' + rel self._write_to_file(key, binary_features, input_file) operation.left_arc(conf, rel) training_seq.append(key) continue # Right-arc operation rel = self._get_dep_relation(s0, b0, depgraph) if rel is not None: key = Transition.RIGHT_ARC + ':' + rel self._write_to_file(key, binary_features, input_file) operation.right_arc(conf, rel) training_seq.append(key) continue # reduce operation flag = False for k in range(s0): if self._get_dep_relation(k, b0, depgraph) is not None: flag = True if self._get_dep_relation(b0, k, depgraph) is not None: flag = True if flag: key = Transition.REDUCE self._write_to_file(key, binary_features, input_file) operation.reduce(conf) training_seq.append(key) continue # Shift operation as the default key = Transition.SHIFT self._write_to_file(key, binary_features, input_file) operation.shift(conf) training_seq.append(key) print(" Number of training examples : " + str(len(depgraphs))) print(" Number of valid (projective) examples : " + str(countProj)) return training_seq def train(self, depgraphs, modelfile): """ :param depgraphs : list of DependencyGraph as the training data :type depgraphs : DependencyGraph :param modelfile : file name to save the trained model :type modelfile : str """ try: 
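# Training pipeline, step by step: (1) serialise each (transition,
# feature vector) pair for the projective gold graphs into a temporary
# file in libsvm format, (2) reload that file with load_svmlight_file,
# (3) fit a polynomial-kernel SVC with probability estimates enabled,
# and (4) pickle the fitted model to `modelfile`.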
input_file = tempfile.NamedTemporaryFile( prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False) if self._algorithm == self.ARC_STANDARD: self._create_training_examples_arc_std(depgraphs, input_file) else: self._create_training_examples_arc_eager(depgraphs, input_file) input_file.close() # Using the temporary file to train the libsvm classifier x_train, y_train = load_svmlight_file(input_file.name) # The parameter is set according to the paper: # Algorithms for Deterministic Incremental Dependency Parsing by Joakim Nivre # Todo : because of probability = True => very slow due to # cross-validation. Need to improve the speed here model = svm.SVC( kernel='poly', degree=2, coef0=0, gamma=0.2, C=0.5, verbose=True, probability=True) model.fit(x_train, y_train) # Save the model to file name (as pickle) pickle.dump(model, open(modelfile, 'wb')) finally: remove(input_file.name) def parse(self, depgraphs, modelFile): """ :param depgraphs: the list of test sentence, each sentence is represented as a dependency graph where the 'head' information is dummy :type depgraphs: list(DependencyGraph) :param modelfile: the model file :type modelfile: str :return: list (DependencyGraph) with the 'head' and 'rel' information """ result = [] # First load the model model = pickle.load(open(modelFile, 'rb')) operation = Transition(self._algorithm) for depgraph in depgraphs: conf = Configuration(depgraph) while len(conf.buffer) > 0: features = conf.extract_features() col = [] row = [] data = [] for feature in features: if feature in self._dictionary: col.append(self._dictionary[feature]) row.append(0) data.append(1.0) np_col = array(sorted(col)) # NB : index must be sorted np_row = array(row) np_data = array(data) x_test = sparse.csr_matrix((np_data, (np_row, np_col)), shape=(1, len(self._dictionary))) # It's best to use decision function as follow BUT it's not supported yet for sparse SVM # Using decision funcion to build the votes array #dec_func = model.decision_function(x_test)[0] #votes = {} #k = 0 # for i in range(len(model.classes_)): # for j in range(i+1, len(model.classes_)): # #if dec_func[k] > 0: # votes.setdefault(i,0) # votes[i] +=1 # else: # votes.setdefault(j,0) # votes[j] +=1 # k +=1 # Sort votes according to the values #sorted_votes = sorted(votes.items(), key=itemgetter(1), reverse=True) # We will use predict_proba instead of decision_function prob_dict = {} pred_prob = model.predict_proba(x_test)[0] for i in range(len(pred_prob)): prob_dict[i] = pred_prob[i] sorted_Prob = sorted( prob_dict.items(), key=itemgetter(1), reverse=True) # Note that SHIFT is always a valid operation for (y_pred_idx, confidence) in sorted_Prob: #y_pred = model.predict(x_test)[0] # From the prediction match to the operation y_pred = model.classes_[y_pred_idx] if y_pred in self._match_transition: strTransition = self._match_transition[y_pred] baseTransition = strTransition.split(":")[0] if baseTransition == Transition.LEFT_ARC: if operation.left_arc(conf, strTransition.split(":")[1]) != -1: break elif baseTransition == Transition.RIGHT_ARC: if operation.right_arc(conf, strTransition.split(":")[1]) != -1: break elif baseTransition == Transition.REDUCE: if operation.reduce(conf) != -1: break elif baseTransition == Transition.SHIFT: if operation.shift(conf) != -1: break else: raise ValueError("The predicted transition is not recognized, expected errors") # Finish with operations build the dependency graph from Conf.arcs new_depgraph = deepcopy(depgraph) for key in new_depgraph.nodes: node = 
new_depgraph.nodes[key] node['rel'] = '' # With the default, all the token depend on the Root node['head'] = 0 for (head, rel, child) in conf.arcs: c_node = new_depgraph.nodes[child] c_node['head'] = head c_node['rel'] = rel result.append(new_depgraph) return result def demo(): """ >>> from nltk.parse import DependencyGraph, DependencyEvaluator >>> from nltk.parse.transitionparser import TransitionParser, Configuration, Transition >>> gold_sent = DependencyGraph(\""" ... Economic JJ 2 ATT ... news NN 3 SBJ ... has VBD 0 ROOT ... little JJ 5 ATT ... effect NN 3 OBJ ... on IN 5 ATT ... financial JJ 8 ATT ... markets NNS 6 PC ... . . 3 PU ... \""") >>> conf = Configuration(gold_sent) ###################### Check the Initial Feature ######################## >>> print(', '.join(conf.extract_features())) STK_0_POS_TOP, BUF_0_FORM_Economic, BUF_0_LEMMA_Economic, BUF_0_POS_JJ, BUF_1_FORM_news, BUF_1_POS_NN, BUF_2_POS_VBD, BUF_3_POS_JJ ###################### Check The Transition ####################### Check the Initialized Configuration >>> print(conf) Stack : [0] Buffer : [1, 2, 3, 4, 5, 6, 7, 8, 9] Arcs : [] A. Do some transition checks for ARC-STANDARD >>> operation = Transition('arc-standard') >>> operation.shift(conf) >>> operation.left_arc(conf, "ATT") >>> operation.shift(conf) >>> operation.left_arc(conf,"SBJ") >>> operation.shift(conf) >>> operation.shift(conf) >>> operation.left_arc(conf, "ATT") >>> operation.shift(conf) >>> operation.shift(conf) >>> operation.shift(conf) >>> operation.left_arc(conf, "ATT") Middle Configuration and Features Check >>> print(conf) Stack : [0, 3, 5, 6] Buffer : [8, 9] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7)] >>> print(', '.join(conf.extract_features())) STK_0_FORM_on, STK_0_LEMMA_on, STK_0_POS_IN, STK_1_POS_NN, BUF_0_FORM_markets, BUF_0_LEMMA_markets, BUF_0_POS_NNS, BUF_1_FORM_., BUF_1_POS_., BUF_0_LDEP_ATT >>> operation.right_arc(conf, "PC") >>> operation.right_arc(conf, "ATT") >>> operation.right_arc(conf, "OBJ") >>> operation.shift(conf) >>> operation.right_arc(conf, "PU") >>> operation.right_arc(conf, "ROOT") >>> operation.shift(conf) Terminated Configuration Check >>> print(conf) Stack : [0] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (5, 'ATT', 4), (8, 'ATT', 7), (6, 'PC', 8), (5, 'ATT', 6), (3, 'OBJ', 5), (3, 'PU', 9), (0, 'ROOT', 3)] B. Do some transition checks for ARC-EAGER >>> conf = Configuration(gold_sent) >>> operation = Transition('arc-eager') >>> operation.shift(conf) >>> operation.left_arc(conf,'ATT') >>> operation.shift(conf) >>> operation.left_arc(conf,'SBJ') >>> operation.right_arc(conf,'ROOT') >>> operation.shift(conf) >>> operation.left_arc(conf,'ATT') >>> operation.right_arc(conf,'OBJ') >>> operation.right_arc(conf,'ATT') >>> operation.shift(conf) >>> operation.left_arc(conf,'ATT') >>> operation.right_arc(conf,'PC') >>> operation.reduce(conf) >>> operation.reduce(conf) >>> operation.reduce(conf) >>> operation.right_arc(conf,'PU') >>> print(conf) Stack : [0, 3, 9] Buffer : [] Arcs : [(2, 'ATT', 1), (3, 'SBJ', 2), (0, 'ROOT', 3), (5, 'ATT', 4), (3, 'OBJ', 5), (5, 'ATT', 6), (8, 'ATT', 7), (6, 'PC', 8), (3, 'PU', 9)] ###################### Check The Training Function ####################### A. 
Check the ARC-STANDARD training >>> import tempfile >>> import os >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(), delete=False) >>> parser_std = TransitionParser('arc-standard') >>> print(', '.join(parser_std._create_training_examples_arc_std([gold_sent], input_file))) Number of training examples : 1 Number of valid (projective) examples : 1 SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, SHIFT, SHIFT, LEFTARC:ATT, SHIFT, SHIFT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, RIGHTARC:ATT, RIGHTARC:OBJ, SHIFT, RIGHTARC:PU, RIGHTARC:ROOT, SHIFT >>> parser_std.train([gold_sent],'temp.arcstd.model') Number of training examples : 1 Number of valid (projective) examples : 1 ... >>> remove(input_file.name) B. Check the ARC-EAGER training >>> input_file = tempfile.NamedTemporaryFile(prefix='transition_parse.train', dir=tempfile.gettempdir(),delete=False) >>> parser_eager = TransitionParser('arc-eager') >>> print(', '.join(parser_eager._create_training_examples_arc_eager([gold_sent], input_file))) Number of training examples : 1 Number of valid (projective) examples : 1 SHIFT, LEFTARC:ATT, SHIFT, LEFTARC:SBJ, RIGHTARC:ROOT, SHIFT, LEFTARC:ATT, RIGHTARC:OBJ, RIGHTARC:ATT, SHIFT, LEFTARC:ATT, RIGHTARC:PC, REDUCE, REDUCE, REDUCE, RIGHTARC:PU >>> parser_eager.train([gold_sent],'temp.arceager.model') Number of training examples : 1 Number of valid (projective) examples : 1 ... >>> remove(input_file.name) ###################### Check The Parsing Function ######################## A. Check the ARC-STANDARD parser >>> result = parser_std.parse([gold_sent], 'temp.arcstd.model') >>> de = DependencyEvaluator(result, [gold_sent]) >>> de.eval() >= (0, 0) True B. Check the ARC-EAGER parser >>> result = parser_eager.parse([gold_sent], 'temp.arceager.model') >>> de = DependencyEvaluator(result, [gold_sent]) >>> de.eval() >= (0, 0) True Note that result is very poor because of only one training example. """ nltk-3.1/nltk/parse/util.py0000644000076500000240000002043312607224144015471 0ustar sbstaff00000000000000# Natural Language Toolkit: Parser Utility Functions # # Author: Ewan Klein # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT """ Utility functions for parsers. """ from __future__ import print_function from nltk.grammar import CFG, FeatureGrammar, PCFG from nltk.data import load from nltk.parse.chart import Chart, ChartParser from nltk.parse.pchart import InsideChartParser from nltk.parse.featurechart import FeatureChart, FeatureChartParser def load_parser(grammar_url, trace=0, parser=None, chart_class=None, beam_size=0, **load_args): """ Load a grammar from a file, and build a parser based on that grammar. The parser depends on the grammar format, and might also depend on properties of the grammar itself. The following grammar formats are currently supported: - ``'cfg'`` (CFGs: ``CFG``) - ``'pcfg'`` (probabilistic CFGs: ``PCFG``) - ``'fcfg'`` (feature-based CFGs: ``FeatureGrammar``) :type grammar_url: str :param grammar_url: A URL specifying where the grammar is located. The default protocol is ``"nltk:"``, which searches for the file in the the NLTK data package. :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; and higher numbers will produce more verbose tracing output. :param parser: The class used for parsing; should be ``ChartParser`` or a subclass. If None, the class depends on the grammar format. 
:param chart_class: The class used for storing the chart; should be ``Chart`` or a subclass. Only used for CFGs and feature CFGs. If None, the chart class depends on the grammar format. :type beam_size: int :param beam_size: The maximum length for the parser's edge queue. Only used for probabilistic CFGs. :param load_args: Keyword parameters used when loading the grammar. See ``data.load`` for more information. """ grammar = load(grammar_url, **load_args) if not isinstance(grammar, CFG): raise ValueError("The grammar must be a CFG, " "or a subclass thereof.") if isinstance(grammar, PCFG): if parser is None: parser = InsideChartParser return parser(grammar, trace=trace, beam_size=beam_size) elif isinstance(grammar, FeatureGrammar): if parser is None: parser = FeatureChartParser if chart_class is None: chart_class = FeatureChart return parser(grammar, trace=trace, chart_class=chart_class) else: # Plain CFG. if parser is None: parser = ChartParser if chart_class is None: chart_class = Chart return parser(grammar, trace=trace, chart_class=chart_class) def taggedsent_to_conll(sentence): """ A module to convert a single POS tagged sentence into CONLL format. >>> from nltk import word_tokenize, pos_tag >>> text = "This is a foobar sentence." >>> for line in taggedsent_to_conll(pos_tag(word_tokenize(text))): ... print(line, end="") 1 This _ DT DT _ 0 a _ _ 2 is _ VBZ VBZ _ 0 a _ _ 3 a _ DT DT _ 0 a _ _ 4 foobar _ JJ JJ _ 0 a _ _ 5 sentence _ NN NN _ 0 a _ _ 6 . _ . . _ 0 a _ _ :param sentence: A single input sentence to parse :type sentence: list(tuple(str, str)) :rtype: iter(str) :return: a generator yielding a single sentence in CONLL format. """ for (i, (word, tag)) in enumerate(sentence, start=1): input_str = [str(i), word, '_', tag, tag, '_', '0', 'a', '_', '_'] input_str = "\t".join(input_str) + "\n" yield input_str def taggedsents_to_conll(sentences): """ A module to convert the a POS tagged document stream (i.e. list of list of tuples, a list of sentences) and yield lines in CONLL format. This module yields one line per word and two newlines for end of sentence. >>> from nltk import word_tokenize, sent_tokenize, pos_tag >>> text = "This is a foobar sentence. Is that right?" >>> sentences = [pos_tag(word_tokenize(sent)) for sent in sent_tokenize(text)] >>> for line in taggedsents_to_conll(sentences): ... if line: ... print(line, end="") 1 This _ DT DT _ 0 a _ _ 2 is _ VBZ VBZ _ 0 a _ _ 3 a _ DT DT _ 0 a _ _ 4 foobar _ JJ JJ _ 0 a _ _ 5 sentence _ NN NN _ 0 a _ _ 6 . _ . . _ 0 a _ _ 1 Is _ VBZ VBZ _ 0 a _ _ 2 that _ IN IN _ 0 a _ _ 3 right _ NN NN _ 0 a _ _ 4 ? _ . . _ 0 a _ _ :param sentences: Input sentences to parse :type sentence: list(list(tuple(str, str))) :rtype: iter(str) :return: a generator yielding sentences in CONLL format. """ for sentence in sentences: for input_str in taggedsent_to_conll(sentence): yield input_str yield '\n\n' ###################################################################### #{ Test Suites ###################################################################### class TestGrammar(object): """ Unit tests for CFG. """ def __init__(self, grammar, suite, accept=None, reject=None): self.test_grammar = grammar self.cp = load_parser(grammar, trace=0) self.suite = suite self._accept = accept self._reject = reject def run(self, show_trees=False): """ Sentences in the test suite are divided into two classes: - grammatical (``accept``) and - ungrammatical (``reject``). If a sentence should parse accordng to the grammar, the value of ``trees`` will be a non-empty list. 
If a sentence should be rejected according to the grammar, then the value of ``trees`` will be None. """ for test in self.suite: print(test['doc'] + ":", end=' ') for key in ['accept', 'reject']: for sent in test[key]: tokens = sent.split() trees = list(self.cp.parse(tokens)) if show_trees and trees: print() print(sent) for tree in trees: print(tree) if key == 'accept': if trees == []: raise ValueError("Sentence '%s' failed to parse'" % sent) else: accepted = True else: if trees: raise ValueError("Sentence '%s' received a parse'" % sent) else: rejected = True if accepted and rejected: print("All tests passed!") def extract_test_sentences(string, comment_chars="#%;", encoding=None): """ Parses a string with one test sentence per line. Lines can optionally begin with: - a bool, saying if the sentence is grammatical or not, or - an int, giving the number of parse trees is should have, The result information is followed by a colon, and then the sentence. Empty lines and lines beginning with a comment char are ignored. :return: a list of tuple of sentences and expected results, where a sentence is a list of str, and a result is None, or bool, or int :param comment_chars: ``str`` of possible comment characters. :param encoding: the encoding of the string, if it is binary """ if encoding is not None: string = string.decode(encoding) sentences = [] for sentence in string.split('\n'): if sentence == '' or sentence[0] in comment_chars: continue split_info = sentence.split(':', 1) result = None if len(split_info) == 2: if split_info[0] in ['True','true','False','false']: result = split_info[0] in ['True','true'] sentence = split_info[1] else: result = int(split_info[0]) sentence = split_info[1] tokens = sentence.split() if tokens == []: continue sentences += [(tokens, result)] return sentences # nose thinks it is a test extract_test_sentences.__test__ = False nltk-3.1/nltk/parse/viterbi.py0000644000076500000240000004072212607224144016163 0ustar sbstaff00000000000000# Natural Language Toolkit: Viterbi Probabilistic Parser # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT from __future__ import print_function, unicode_literals from functools import reduce from nltk.tree import Tree, ProbabilisticTree from nltk.compat import python_2_unicode_compatible from nltk.parse.api import ParserI ##////////////////////////////////////////////////////// ## Viterbi PCFG Parser ##////////////////////////////////////////////////////// @python_2_unicode_compatible class ViterbiParser(ParserI): """ A bottom-up ``PCFG`` parser that uses dynamic programming to find the single most likely parse for a text. The ``ViterbiParser`` parser parses texts by filling in a "most likely constituent table". This table records the most probable tree representation for any given span and node value. In particular, it has an entry for every start index, end index, and node value, recording the most likely subtree that spans from the start index to the end index, and has the given node value. The ``ViterbiParser`` parser fills in this table incrementally. It starts by filling in all entries for constituents that span one element of text (i.e., entries where the end index is one greater than the start index). After it has filled in all table entries for constituents that span one element of text, it fills in the entries for constitutants that span two elements of text. 
It continues filling in the entries for constituents spanning larger and larger portions of the text, until the entire table has been filled. Finally, it returns the table entry for a constituent spanning the entire text, whose node value is the grammar's start symbol. In order to find the most likely constituent with a given span and node value, the ``ViterbiParser`` parser considers all productions that could produce that node value. For each production, it finds all children that collectively cover the span and have the node values specified by the production's right hand side. If the probability of the tree formed by applying the production to the children is greater than the probability of the current entry in the table, then the table is updated with this new tree. A pseudo-code description of the algorithm used by ``ViterbiParser`` is: | Create an empty most likely constituent table, *MLC*. | For width in 1...len(text): | For start in 1...len(text)-width: | For prod in grammar.productions: | For each sequence of subtrees [t[1], t[2], ..., t[n]] in MLC, | where t[i].label()==prod.rhs[i], | and the sequence covers [start:start+width]: | old_p = MLC[start, start+width, prod.lhs] | new_p = P(t[1])P(t[1])...P(t[n])P(prod) | if new_p > old_p: | new_tree = Tree(prod.lhs, t[1], t[2], ..., t[n]) | MLC[start, start+width, prod.lhs] = new_tree | Return MLC[0, len(text), start_symbol] :type _grammar: PCFG :ivar _grammar: The grammar used to parse sentences. :type _trace: int :ivar _trace: The level of tracing output that should be generated when parsing a text. """ def __init__(self, grammar, trace=0): """ Create a new ``ViterbiParser`` parser, that uses ``grammar`` to parse texts. :type grammar: PCFG :param grammar: The grammar used to parse texts. :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; and higher numbers will produce more verbose tracing output. """ self._grammar = grammar self._trace = trace def grammar(self): return self._grammar def trace(self, trace=2): """ Set the level of tracing output that should be generated when parsing a text. :type trace: int :param trace: The trace level. A trace level of ``0`` will generate no tracing output; and higher trace levels will produce more verbose tracing output. :rtype: None """ self._trace = trace def parse(self, tokens): # Inherit docs from ParserI tokens = list(tokens) self._grammar.check_coverage(tokens) # The most likely constituent table. This table specifies the # most likely constituent for a given span and type. # Constituents can be either Trees or tokens. For Trees, # the "type" is the Nonterminal for the tree's root node # value. For Tokens, the "type" is the token's type. # The table is stored as a dictionary, since it is sparse. constituents = {} # Initialize the constituents dictionary with the words from # the text. if self._trace: print(('Inserting tokens into the most likely'+ ' constituents table...')) for index in range(len(tokens)): token = tokens[index] constituents[index,index+1,token] = token if self._trace > 1: self._trace_lexical_insertion(token, index, len(tokens)) # Consider each span of length 1, 2, ..., n; and add any trees # that might cover that span to the constituents dictionary. for length in range(1, len(tokens)+1): if self._trace: print(('Finding the most likely constituents'+ ' spanning %d text elements...' 
% length)) for start in range(len(tokens)-length+1): span = (start, start+length) self._add_constituents_spanning(span, constituents, tokens) # Return the tree that spans the entire text & have the right cat tree = constituents.get((0, len(tokens), self._grammar.start())) if tree is not None: yield tree def _add_constituents_spanning(self, span, constituents, tokens): """ Find any constituents that might cover ``span``, and add them to the most likely constituents table. :rtype: None :type span: tuple(int, int) :param span: The section of the text for which we are trying to find possible constituents. The span is specified as a pair of integers, where the first integer is the index of the first token that should be included in the constituent; and the second integer is the index of the first token that should not be included in the constituent. I.e., the constituent should cover ``text[span[0]:span[1]]``, where ``text`` is the text that we are parsing. :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree) :param constituents: The most likely constituents table. This table records the most probable tree representation for any given span and node value. In particular, ``constituents(s,e,nv)`` is the most likely ``ProbabilisticTree`` that covers ``text[s:e]`` and has a node value ``nv.symbol()``, where ``text`` is the text that we are parsing. When ``_add_constituents_spanning`` is called, ``constituents`` should contain all possible constituents that are shorter than ``span``. :type tokens: list of tokens :param tokens: The text we are parsing. This is only used for trace output. """ # Since some of the grammar productions may be unary, we need to # repeatedly try all of the productions until none of them add any # new constituents. changed = True while changed: changed = False # Find all ways instantiations of the grammar productions that # cover the span. instantiations = self._find_instantiations(span, constituents) # For each production instantiation, add a new # ProbabilisticTree whose probability is the product # of the childrens' probabilities and the production's # probability. for (production, children) in instantiations: subtrees = [c for c in children if isinstance(c, Tree)] p = reduce(lambda pr,t:pr*t.prob(), subtrees, production.prob()) node = production.lhs().symbol() tree = ProbabilisticTree(node, children, prob=p) # If it's new a constituent, then add it to the # constituents dictionary. c = constituents.get((span[0], span[1], production.lhs())) if self._trace > 1: if c is None or c != tree: if c is None or c.prob() < tree.prob(): print(' Insert:', end=' ') else: print(' Discard:', end=' ') self._trace_production(production, p, span, len(tokens)) if c is None or c.prob() < tree.prob(): constituents[span[0], span[1], production.lhs()] = tree changed = True def _find_instantiations(self, span, constituents): """ :return: a list of the production instantiations that cover a given span of the text. A "production instantiation" is a tuple containing a production and a list of children, where the production's right hand side matches the list of children; and the children cover ``span``. :rtype: list of ``pair`` of ``Production``, (list of (``ProbabilisticTree`` or token. :type span: tuple(int, int) :param span: The section of the text for which we are trying to find production instantiations. 
The span is specified as a pair of integers, where the first integer is the index of the first token that should be covered by the production instantiation; and the second integer is the index of the first token that should not be covered by the production instantiation. :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree) :param constituents: The most likely constituents table. This table records the most probable tree representation for any given span and node value. See the module documentation for more information. """ rv = [] for production in self._grammar.productions(): childlists = self._match_rhs(production.rhs(), span, constituents) for childlist in childlists: rv.append( (production, childlist) ) return rv def _match_rhs(self, rhs, span, constituents): """ :return: a set of all the lists of children that cover ``span`` and that match ``rhs``. :rtype: list(list(ProbabilisticTree or token) :type rhs: list(Nonterminal or any) :param rhs: The list specifying what kinds of children need to cover ``span``. Each nonterminal in ``rhs`` specifies that the corresponding child should be a tree whose node value is that nonterminal's symbol. Each terminal in ``rhs`` specifies that the corresponding child should be a token whose type is that terminal. :type span: tuple(int, int) :param span: The section of the text for which we are trying to find child lists. The span is specified as a pair of integers, where the first integer is the index of the first token that should be covered by the child list; and the second integer is the index of the first token that should not be covered by the child list. :type constituents: dict(tuple(int,int,Nonterminal) -> ProbabilisticToken or ProbabilisticTree) :param constituents: The most likely constituents table. This table records the most probable tree representation for any given span and node value. See the module documentation for more information. """ (start, end) = span # Base case if start >= end and rhs == (): return [[]] if start >= end or rhs == (): return [] # Find everything that matches the 1st symbol of the RHS childlists = [] for split in range(start, end+1): l=constituents.get((start,split,rhs[0])) if l is not None: rights = self._match_rhs(rhs[1:], (split,end), constituents) childlists += [[l]+r for r in rights] return childlists def _trace_production(self, production, p, span, width): """ Print trace output indicating that a given production has been applied at a given location. :param production: The production that has been applied :type production: Production :param p: The probability of the tree produced by the production. :type p: float :param span: The span of the production :type span: tuple :rtype: None """ str = '|' + '.' * span[0] str += '=' * (span[1] - span[0]) str += '.' * (width - span[1]) + '| ' str += '%s' % production if self._trace > 2: str = '%-40s %12.10f ' % (str, p) print(str) def _trace_lexical_insertion(self, token, index, width): str = ' Insert: |' + '.' * index + '=' + '.' * (width-index-1) + '| ' str += '%s' % (token,) print(str) def __repr__(self): return '' % self._grammar ##////////////////////////////////////////////////////// ## Test Code ##////////////////////////////////////////////////////// def demo(): """ A demonstration of the probabilistic parsers. The user is prompted to select which demo to run, and how many parses should be found; and then each parser is run on the same demo, and a summary of the results are displayed. 
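A minimal non-interactive sketch of the same idea (illustrative only, not a doctest)::

    from nltk.grammar import toy_pcfg1
    from nltk.parse import ViterbiParser

    parser = ViterbiParser(toy_pcfg1)
    for tree in parser.parse('I saw the man with my telescope'.split()):
        print(tree)
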
""" import sys, time from nltk import tokenize from nltk.parse import ViterbiParser from nltk.grammar import toy_pcfg1, toy_pcfg2 # Define two demos. Each demo has a sentence and a grammar. demos = [('I saw the man with my telescope', toy_pcfg1), ('the boy saw Jack with Bob under the table with a telescope', toy_pcfg2)] # Ask the user which demo they want to use. print() for i in range(len(demos)): print('%3s: %s' % (i+1, demos[i][0])) print(' %r' % demos[i][1]) print() print('Which demo (%d-%d)? ' % (1, len(demos)), end=' ') try: snum = int(sys.stdin.readline().strip())-1 sent, grammar = demos[snum] except: print('Bad sentence number') return # Tokenize the sentence. tokens = sent.split() parser = ViterbiParser(grammar) all_parses = {} print('\nsent: %s\nparser: %s\ngrammar: %s' % (sent,parser,grammar)) parser.trace(3) t = time.time() parses = parser.parse_all(tokens) time = time.time()-t average = (reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses) if parses else 0) num_parses = len(parses) for p in parses: all_parses[p.freeze()] = 1 # Print some summary statistics print() print('Time (secs) # Parses Average P(parse)') print('-----------------------------------------') print('%11.4f%11d%19.14f' % (time, num_parses, average)) parses = all_parses.keys() if parses: p = reduce(lambda a,b:a+b.prob(), parses, 0)/len(parses) else: p = 0 print('------------------------------------------') print('%11s%11d%19.14f' % ('n/a', len(parses), p)) # Ask the user if we should draw the parses. print() print('Draw parses (y/n)? ', end=' ') if sys.stdin.readline().strip().lower().startswith('y'): from nltk.draw.tree import draw_trees print(' please wait...') draw_trees(*parses) # Ask the user if we should print the parses. print() print('Print parses (y/n)? ', end=' ') if sys.stdin.readline().strip().lower().startswith('y'): for parse in parses: print(parse) if __name__ == '__main__': demo() nltk-3.1/nltk/probability.py0000644000076500000240000024316212607224144015730 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Probability and Statistics # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird (additions) # Trevor Cohn (additions) # Peter Ljunglöf (additions) # Liang Dong (additions) # Geoffrey Sampson (additions) # Ilia Kurenkov (additions) # # URL: # For license information, see LICENSE.TXT """ Classes for representing and processing probabilistic information. The ``FreqDist`` class is used to encode "frequency distributions", which count the number of times that each outcome of an experiment occurs. The ``ProbDistI`` class defines a standard interface for "probability distributions", which encode the probability of each outcome for an experiment. There are two types of probability distribution: - "derived probability distributions" are created from frequency distributions. They attempt to model the probability distribution that generated the frequency distribution. - "analytic probability distributions" are created directly from parameters (such as variance). The ``ConditionalFreqDist`` class and ``ConditionalProbDistI`` interface are used to encode conditional distributions. Conditional probability distributions can be derived or analytic; but currently the only implementation of the ``ConditionalProbDistI`` interface is ``ConditionalProbDist``, a derived distribution. 
""" from __future__ import print_function, unicode_literals import math import random import warnings import array from operator import itemgetter from collections import defaultdict from functools import reduce from nltk import compat from nltk.compat import Counter from nltk.internals import raise_unorderable_types _NINF = float('-1e300') ##////////////////////////////////////////////////////// ## Frequency Distributions ##////////////////////////////////////////////////////// @compat.python_2_unicode_compatible class FreqDist(Counter): """ A frequency distribution for the outcomes of an experiment. A frequency distribution records the number of times each outcome of an experiment has occurred. For example, a frequency distribution could be used to record the frequency of each word type in a document. Formally, a frequency distribution can be defined as a function mapping from each sample to the number of times that sample occurred as an outcome. Frequency distributions are generally constructed by running a number of experiments, and incrementing the count for a sample every time it is an outcome of an experiment. For example, the following code will produce a frequency distribution that encodes how often each word occurs in a text: >>> from nltk.tokenize import word_tokenize >>> from nltk.probability import FreqDist >>> sent = 'This is an example sentence' >>> fdist = FreqDist() >>> for word in word_tokenize(sent): ... fdist[word.lower()] += 1 An equivalent way to do this is with the initializer: >>> fdist = FreqDist(word.lower() for word in word_tokenize(sent)) """ def __init__(self, samples=None): """ Construct a new frequency distribution. If ``samples`` is given, then the frequency distribution will be initialized with the count of each object in ``samples``; otherwise, it will be initialized to be empty. In particular, ``FreqDist()`` returns an empty frequency distribution; and ``FreqDist(samples)`` first creates an empty frequency distribution, and then calls ``update`` with the list ``samples``. :param samples: The samples to initialize the frequency distribution with. :type samples: Sequence """ Counter.__init__(self, samples) def N(self): """ Return the total number of sample outcomes that have been recorded by this FreqDist. For the number of unique sample values (or bins) with counts greater than zero, use ``FreqDist.B()``. :rtype: int """ return sum(self.values()) def B(self): """ Return the total number of sample values (or "bins") that have counts greater than zero. For the total number of sample outcomes recorded, use ``FreqDist.N()``. (FreqDist.B() is the same as len(FreqDist).) :rtype: int """ return len(self) def hapaxes(self): """ Return a list of all samples that occur once (hapax legomena) :rtype: list """ return [item for item in self if self[item] == 1] def Nr(self, r, bins=None): return self.r_Nr(bins)[r] def r_Nr(self, bins=None): """ Return the dictionary mapping r to Nr, the number of samples with frequency r, where Nr > 0. :type bins: int :param bins: The number of possible sample outcomes. ``bins`` is used to calculate Nr(0). In particular, Nr(0) is ``bins-self.B()``. If ``bins`` is not specified, it defaults to ``self.B()`` (so Nr(0) will be 0). :rtype: int """ _r_Nr = defaultdict(int) for count in self.values(): _r_Nr[count] += 1 # Special case for Nr[0]: _r_Nr[0] = bins - self.B() if bins is not None else 0 return _r_Nr def _cumulative_frequencies(self, samples): """ Return the cumulative frequencies of the specified samples. 
If no samples are specified, all counts are returned, starting with the largest. :param samples: the samples whose frequencies should be returned. :type samples: any :rtype: list(float) """ cf = 0.0 for sample in samples: cf += self[sample] yield cf # slightly odd nomenclature freq() if FreqDist does counts and ProbDist does probs, # here, freq() does probs def freq(self, sample): """ Return the frequency of a given sample. The frequency of a sample is defined as the count of that sample divided by the total number of sample outcomes that have been recorded by this FreqDist. The count of a sample is defined as the number of times that sample outcome was recorded by this FreqDist. Frequencies are always real numbers in the range [0, 1]. :param sample: the sample whose frequency should be returned. :type sample: any :rtype: float """ if self.N() == 0: return 0 return float(self[sample]) / self.N() def max(self): """ Return the sample with the greatest number of outcomes in this frequency distribution. If two or more samples have the same number of outcomes, return one of them; which sample is returned is undefined. If no outcomes have occurred in this frequency distribution, return None. :return: The sample with the maximum number of outcomes in this frequency distribution. :rtype: any or None """ if len(self) == 0: raise ValueError('A FreqDist must have at least one sample before max is defined.') return self.most_common(1)[0][0] def plot(self, *args, **kwargs): """ Plot samples from the frequency distribution displaying the most frequent sample first. If an integer parameter is supplied, stop after this many samples have been plotted. For a cumulative plot, specify cumulative=True. (Requires Matplotlib to be installed.) :param title: The title for the graph :type title: str :param cumulative: A flag to specify whether the plot is cumulative (default = False) :type title: bool """ try: from matplotlib import pylab except ImportError: raise ValueError('The plot function requires matplotlib to be installed.' 'See http://matplotlib.org/') if len(args) == 0: args = [len(self)] samples = [item for item, _ in self.most_common(*args)] cumulative = _get_kwarg(kwargs, 'cumulative', False) if cumulative: freqs = list(self._cumulative_frequencies(samples)) ylabel = "Cumulative Counts" else: freqs = [self[sample] for sample in samples] ylabel = "Counts" # percents = [f * 100 for f in freqs] only in ProbDist? pylab.grid(True, color="silver") if not "linewidth" in kwargs: kwargs["linewidth"] = 2 if "title" in kwargs: pylab.title(kwargs["title"]) del kwargs["title"] pylab.plot(freqs, **kwargs) pylab.xticks(range(len(samples)), [compat.text_type(s) for s in samples], rotation=90) pylab.xlabel("Samples") pylab.ylabel(ylabel) pylab.show() def tabulate(self, *args, **kwargs): """ Tabulate the given samples from the frequency distribution (cumulative), displaying the most frequent sample first. If an integer parameter is supplied, stop after this many samples have been plotted. :param samples: The samples to plot (default is all samples) :type samples: list """ if len(args) == 0: args = [len(self)] samples = [item for item, _ in self.most_common(*args)] cumulative = _get_kwarg(kwargs, 'cumulative', False) if cumulative: freqs = list(self._cumulative_frequencies(samples)) else: freqs = [self[sample] for sample in samples] # percents = [f * 100 for f in freqs] only in ProbDist? 
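# Illustrative usage of freq(), max() and tabulate() (an informal sketch with
# toy data; these calls are examples only, not part of the method below):
#
#     fd = FreqDist('abracadabra')
#     fd.freq('a')       # 5/11, i.e. the count of 'a' divided by fd.N()
#     fd.max()           # 'a', the most frequent sample
#     fd.tabulate(3)     # print counts for the 3 most common samples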
for i in range(len(samples)): print("%4s" % samples[i], end=' ') print() for i in range(len(samples)): print("%4d" % freqs[i], end=' ') print() def copy(self): """ Create a copy of this frequency distribution. :rtype: FreqDist """ return self.__class__(self) def __le__(self, other): if not isinstance(other, FreqDist): raise_unorderable_types("<=", self, other) return set(self).issubset(other) and all(self[key] <= other[key] for key in self) # @total_ordering doesn't work here, since the class inherits from a builtin class __ge__ = lambda self, other: not self <= other or self == other __lt__ = lambda self, other: self <= other and not self == other __gt__ = lambda self, other: not self <= other def __repr__(self): """ Return a string representation of this FreqDist. :rtype: string """ return self.pformat() def pprint(self, maxlen=10, stream=None): """ Print a string representation of this FreqDist to 'stream' :param maxlen: The maximum number of items to print :type maxlen: int :param stream: The stream to print to. stdout by default """ print(self.pformat(maxlen=maxlen), file=stream) def pformat(self, maxlen=10): """ Return a string representation of this FreqDist. :param maxlen: The maximum number of items to display :type maxlen: int :rtype: string """ items = ['{0!r}: {1!r}'.format(*item) for item in self.most_common(maxlen)] if len(self) > maxlen: items.append('...') return 'FreqDist({{{0}}})'.format(', '.join(items)) def __str__(self): """ Return a string representation of this FreqDist. :rtype: string """ return '' % (len(self), self.N()) ##////////////////////////////////////////////////////// ## Probability Distributions ##////////////////////////////////////////////////////// class ProbDistI(object): """ A probability distribution for the outcomes of an experiment. A probability distribution specifies how likely it is that an experiment will have any given outcome. For example, a probability distribution could be used to predict the probability that a token in a document will have a given type. Formally, a probability distribution can be defined as a function mapping from samples to nonnegative real numbers, such that the sum of every number in the function's range is 1.0. A ``ProbDist`` is often used to model the probability distribution of the experiment used to generate a frequency distribution. """ SUM_TO_ONE = True """True if the probabilities of the samples in this probability distribution will always sum to one.""" def __init__(self): if self.__class__ == ProbDistI: raise NotImplementedError("Interfaces can't be instantiated") def prob(self, sample): """ Return the probability for a given sample. Probabilities are always real numbers in the range [0, 1]. :param sample: The sample whose probability should be returned. :type sample: any :rtype: float """ raise NotImplementedError() def logprob(self, sample): """ Return the base 2 logarithm of the probability for a given sample. :param sample: The sample whose probability should be returned. :type sample: any :rtype: float """ # Default definition, in terms of prob() p = self.prob(sample) return (math.log(p, 2) if p != 0 else _NINF) def max(self): """ Return the sample with the greatest probability. If two or more samples have the same probability, return one of them; which sample is returned is undefined. :rtype: any """ raise NotImplementedError() def samples(self): """ Return a list of all samples that have nonzero probabilities. Use ``prob`` to find the probability of each sample. 
:rtype: list """ raise NotImplementedError() # cf self.SUM_TO_ONE def discount(self): """ Return the ratio by which counts are discounted on average: c*/c :rtype: float """ return 0.0 # Subclasses should define more efficient implementations of this, # where possible. def generate(self): """ Return a randomly selected sample from this probability distribution. The probability of returning each sample ``samp`` is equal to ``self.prob(samp)``. """ p = random.random() p_init = p for sample in self.samples(): p -= self.prob(sample) if p <= 0: return sample # allow for some rounding error: if p < .0001: return sample # we *should* never get here if self.SUM_TO_ONE: warnings.warn("Probability distribution %r sums to %r; generate()" " is returning an arbitrary sample." % (self, p_init-p)) return random.choice(list(self.samples())) @compat.python_2_unicode_compatible class UniformProbDist(ProbDistI): """ A probability distribution that assigns equal probability to each sample in a given set; and a zero probability to all other samples. """ def __init__(self, samples): """ Construct a new uniform probability distribution, that assigns equal probability to each sample in ``samples``. :param samples: The samples that should be given uniform probability. :type samples: list :raise ValueError: If ``samples`` is empty. """ if len(samples) == 0: raise ValueError('A Uniform probability distribution must '+ 'have at least one sample.') self._sampleset = set(samples) self._prob = 1.0/len(self._sampleset) self._samples = list(self._sampleset) def prob(self, sample): return (self._prob if sample in self._sampleset else 0) def max(self): return self._samples[0] def samples(self): return self._samples def __repr__(self): return '' % len(self._sampleset) @compat.python_2_unicode_compatible class RandomProbDist(ProbDistI): """ Generates a random probability distribution whereby each sample will be between 0 and 1 with equal probability (uniform random distribution. Also called a continuous uniform distribution). """ def __init__(self, samples): if len(samples) == 0: raise ValueError('A probability distribution must '+ 'have at least one sample.') self._probs = self.unirand(samples) self._samples = list(self._probs.keys()) @classmethod def unirand(cls, samples): """ The key function that creates a randomized initial distribution that still sums to 1. Set as a dictionary of prob values so that it can still be passed to MutableProbDist and called with identical syntax to UniformProbDist """ randrow = [random.random() for i in range(len(samples))] total = sum(randrow) for i, x in enumerate(randrow): randrow[i] = x/total total = sum(randrow) if total != 1: #this difference, if present, is so small (near NINF) that it #can be subtracted from any element without risking probs not (0 1) randrow[-1] -= total - 1 return dict((s, randrow[i]) for i, s in enumerate(samples)) def prob(self, sample): return self._probs.get(sample, 0) def samples(self): return self._samples def __repr__(self): return '' %len(self._probs) @compat.python_2_unicode_compatible class DictionaryProbDist(ProbDistI): """ A probability distribution whose probabilities are directly specified by a given dictionary. The given dictionary maps samples to probabilities. """ def __init__(self, prob_dict=None, log=False, normalize=False): """ Construct a new probability distribution from the given dictionary, which maps values to probabilities (or to log probabilities, if ``log`` is true). 
If ``normalize`` is true, then the probability values are scaled by a constant factor such that they sum to 1. If called without arguments, the resulting probability distribution assigns zero probability to all values. """ self._prob_dict = (prob_dict.copy() if prob_dict is not None else {}) self._log = log # Normalize the distribution, if requested. if normalize: if len(prob_dict) == 0: raise ValueError('A DictionaryProbDist must have at least one sample ' + 'before it can be normalized.') if log: value_sum = sum_logs(list(self._prob_dict.values())) if value_sum <= _NINF: logp = math.log(1.0/len(prob_dict), 2) for x in prob_dict: self._prob_dict[x] = logp else: for (x, p) in self._prob_dict.items(): self._prob_dict[x] -= value_sum else: value_sum = sum(self._prob_dict.values()) if value_sum == 0: p = 1.0/len(prob_dict) for x in prob_dict: self._prob_dict[x] = p else: norm_factor = 1.0/value_sum for (x, p) in self._prob_dict.items(): self._prob_dict[x] *= norm_factor def prob(self, sample): if self._log: return (2**(self._prob_dict[sample]) if sample in self._prob_dict else 0) else: return self._prob_dict.get(sample, 0) def logprob(self, sample): if self._log: return self._prob_dict.get(sample, _NINF) else: if sample not in self._prob_dict: return _NINF elif self._prob_dict[sample] == 0: return _NINF else: return math.log(self._prob_dict[sample], 2) def max(self): if not hasattr(self, '_max'): self._max = max((p,v) for (v,p) in self._prob_dict.items())[1] return self._max def samples(self): return self._prob_dict.keys() def __repr__(self): return '' % len(self._prob_dict) @compat.python_2_unicode_compatible class MLEProbDist(ProbDistI): """ The maximum likelihood estimate for the probability distribution of the experiment used to generate a frequency distribution. The "maximum likelihood estimate" approximates the probability of each sample as the frequency of that sample in the frequency distribution. """ def __init__(self, freqdist, bins=None): """ Use the maximum likelihood estimate to create a probability distribution for the experiment used to generate ``freqdist``. :type freqdist: FreqDist :param freqdist: The frequency distribution that the probability estimates should be based on. """ self._freqdist = freqdist def freqdist(self): """ Return the frequency distribution that this probability distribution is based on. :rtype: FreqDist """ return self._freqdist def prob(self, sample): return self._freqdist.freq(sample) def max(self): return self._freqdist.max() def samples(self): return self._freqdist.keys() def __repr__(self): """ :rtype: str :return: A string representation of this ``ProbDist``. """ return '' % self._freqdist.N() @compat.python_2_unicode_compatible class LidstoneProbDist(ProbDistI): """ The Lidstone estimate for the probability distribution of the experiment used to generate a frequency distribution. The "Lidstone estimate" is parameterized by a real number *gamma*, which typically ranges from 0 to 1. The Lidstone estimate approximates the probability of a sample with count *c* from an experiment with *N* outcomes and *B* bins as ``c+gamma)/(N+B*gamma)``. This is equivalent to adding *gamma* to the count for each bin, and taking the maximum likelihood estimate of the resulting frequency distribution. """ SUM_TO_ONE = False def __init__(self, freqdist, gamma, bins=None): """ Use the Lidstone estimate to create a probability distribution for the experiment used to generate ``freqdist``. 
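For example, with ``gamma=0.5`` every bin behaves as if it had received an
extra half count (an illustrative sketch with toy data):

>>> from nltk.probability import FreqDist, LidstoneProbDist
>>> fd = FreqDist('abracadabra')       # a=5, b=2, r=2, c=1, d=1; N=11, B=5
>>> lid = LidstoneProbDist(fd, 0.5)    # (c + 0.5) / (N + B*0.5)
>>> round(lid.prob('a'), 4)            # (5 + 0.5) / 13.5
0.4074
>>> round(lid.prob('z'), 4)            # unseen samples get a small nonzero share
0.037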
:type freqdist: FreqDist :param freqdist: The frequency distribution that the probability estimates should be based on. :type gamma: float :param gamma: A real number used to parameterize the estimate. The Lidstone estimate is equivalent to adding *gamma* to the count for each bin, and taking the maximum likelihood estimate of the resulting frequency distribution. :type bins: int :param bins: The number of sample values that can be generated by the experiment that is described by the probability distribution. This value must be correctly set for the probabilities of the sample values to sum to one. If ``bins`` is not specified, it defaults to ``freqdist.B()``. """ if (bins == 0) or (bins is None and freqdist.N() == 0): name = self.__class__.__name__[:-8] raise ValueError('A %s probability distribution ' % name + 'must have at least one bin.') if (bins is not None) and (bins < freqdist.B()): name = self.__class__.__name__[:-8] raise ValueError('\nThe number of bins in a %s distribution ' % name + '(%d) must be greater than or equal to\n' % bins + 'the number of bins in the FreqDist used ' + 'to create it (%d).' % freqdist.B()) self._freqdist = freqdist self._gamma = float(gamma) self._N = self._freqdist.N() if bins is None: bins = freqdist.B() self._bins = bins self._divisor = self._N + bins * gamma if self._divisor == 0.0: # In extreme cases we force the probability to be 0, # which it will be, since the count will be 0: self._gamma = 0 self._divisor = 1 def freqdist(self): """ Return the frequency distribution that this probability distribution is based on. :rtype: FreqDist """ return self._freqdist def prob(self, sample): c = self._freqdist[sample] return (c + self._gamma) / self._divisor def max(self): # For Lidstone distributions, probability is monotonic with # frequency, so the most probable sample is the one that # occurs most frequently. return self._freqdist.max() def samples(self): return self._freqdist.keys() def discount(self): gb = self._gamma * self._bins return gb / (self._N + gb) def __repr__(self): """ Return a string representation of this ``ProbDist``. :rtype: str """ return '' % self._freqdist.N() @compat.python_2_unicode_compatible class LaplaceProbDist(LidstoneProbDist): """ The Laplace estimate for the probability distribution of the experiment used to generate a frequency distribution. The "Laplace estimate" approximates the probability of a sample with count *c* from an experiment with *N* outcomes and *B* bins as *(c+1)/(N+B)*. This is equivalent to adding one to the count for each bin, and taking the maximum likelihood estimate of the resulting frequency distribution. """ def __init__(self, freqdist, bins=None): """ Use the Laplace estimate to create a probability distribution for the experiment used to generate ``freqdist``. :type freqdist: FreqDist :param freqdist: The frequency distribution that the probability estimates should be based on. :type bins: int :param bins: The number of sample values that can be generated by the experiment that is described by the probability distribution. This value must be correctly set for the probabilities of the sample values to sum to one. If ``bins`` is not specified, it defaults to ``freqdist.B()``. """ LidstoneProbDist.__init__(self, freqdist, 1, bins) def __repr__(self): """ :rtype: str :return: A string representation of this ``ProbDist``. 
""" return '' % self._freqdist.N() @compat.python_2_unicode_compatible class ELEProbDist(LidstoneProbDist): """ The expected likelihood estimate for the probability distribution of the experiment used to generate a frequency distribution. The "expected likelihood estimate" approximates the probability of a sample with count *c* from an experiment with *N* outcomes and *B* bins as *(c+0.5)/(N+B/2)*. This is equivalent to adding 0.5 to the count for each bin, and taking the maximum likelihood estimate of the resulting frequency distribution. """ def __init__(self, freqdist, bins=None): """ Use the expected likelihood estimate to create a probability distribution for the experiment used to generate ``freqdist``. :type freqdist: FreqDist :param freqdist: The frequency distribution that the probability estimates should be based on. :type bins: int :param bins: The number of sample values that can be generated by the experiment that is described by the probability distribution. This value must be correctly set for the probabilities of the sample values to sum to one. If ``bins`` is not specified, it defaults to ``freqdist.B()``. """ LidstoneProbDist.__init__(self, freqdist, 0.5, bins) def __repr__(self): """ Return a string representation of this ``ProbDist``. :rtype: str """ return '' % self._freqdist.N() @compat.python_2_unicode_compatible class HeldoutProbDist(ProbDistI): """ The heldout estimate for the probability distribution of the experiment used to generate two frequency distributions. These two frequency distributions are called the "heldout frequency distribution" and the "base frequency distribution." The "heldout estimate" uses uses the "heldout frequency distribution" to predict the probability of each sample, given its frequency in the "base frequency distribution". In particular, the heldout estimate approximates the probability for a sample that occurs *r* times in the base distribution as the average frequency in the heldout distribution of all samples that occur *r* times in the base distribution. This average frequency is *Tr[r]/(Nr[r].N)*, where: - *Tr[r]* is the total count in the heldout distribution for all samples that occur *r* times in the base distribution. - *Nr[r]* is the number of samples that occur *r* times in the base distribution. - *N* is the number of outcomes recorded by the heldout frequency distribution. In order to increase the efficiency of the ``prob`` member function, *Tr[r]/(Nr[r].N)* is precomputed for each value of *r* when the ``HeldoutProbDist`` is created. :type _estimate: list(float) :ivar _estimate: A list mapping from *r*, the number of times that a sample occurs in the base distribution, to the probability estimate for that sample. ``_estimate[r]`` is calculated by finding the average frequency in the heldout distribution of all samples that occur *r* times in the base distribution. In particular, ``_estimate[r]`` = *Tr[r]/(Nr[r].N)*. :type _max_r: int :ivar _max_r: The maximum number of times that any sample occurs in the base distribution. ``_max_r`` is used to decide how large ``_estimate`` must be. """ SUM_TO_ONE = False def __init__(self, base_fdist, heldout_fdist, bins=None): """ Use the heldout estimate to create a probability distribution for the experiment used to generate ``base_fdist`` and ``heldout_fdist``. :type base_fdist: FreqDist :param base_fdist: The base frequency distribution. :type heldout_fdist: FreqDist :param heldout_fdist: The heldout frequency distribution. 
:type bins: int :param bins: The number of sample values that can be generated by the experiment that is described by the probability distribution. This value must be correctly set for the probabilities of the sample values to sum to one. If ``bins`` is not specified, it defaults to ``freqdist.B()``. """ self._base_fdist = base_fdist self._heldout_fdist = heldout_fdist # The max number of times any sample occurs in base_fdist. self._max_r = base_fdist[base_fdist.max()] # Calculate Tr, Nr, and N. Tr = self._calculate_Tr() r_Nr = base_fdist.r_Nr(bins) Nr = [r_Nr[r] for r in range(self._max_r+1)] N = heldout_fdist.N() # Use Tr, Nr, and N to compute the probability estimate for # each value of r. self._estimate = self._calculate_estimate(Tr, Nr, N) def _calculate_Tr(self): """ Return the list *Tr*, where *Tr[r]* is the total count in ``heldout_fdist`` for all samples that occur *r* times in ``base_fdist``. :rtype: list(float) """ Tr = [0.0] * (self._max_r+1) for sample in self._heldout_fdist: r = self._base_fdist[sample] Tr[r] += self._heldout_fdist[sample] return Tr def _calculate_estimate(self, Tr, Nr, N): """ Return the list *estimate*, where *estimate[r]* is the probability estimate for any sample that occurs *r* times in the base frequency distribution. In particular, *estimate[r]* is *Tr[r]/(N[r].N)*. In the special case that *N[r]=0*, *estimate[r]* will never be used; so we define *estimate[r]=None* for those cases. :rtype: list(float) :type Tr: list(float) :param Tr: the list *Tr*, where *Tr[r]* is the total count in the heldout distribution for all samples that occur *r* times in base distribution. :type Nr: list(float) :param Nr: The list *Nr*, where *Nr[r]* is the number of samples that occur *r* times in the base distribution. :type N: int :param N: The total number of outcomes recorded by the heldout frequency distribution. """ estimate = [] for r in range(self._max_r+1): if Nr[r] == 0: estimate.append(None) else: estimate.append(Tr[r]/(Nr[r]*N)) return estimate def base_fdist(self): """ Return the base frequency distribution that this probability distribution is based on. :rtype: FreqDist """ return self._base_fdist def heldout_fdist(self): """ Return the heldout frequency distribution that this probability distribution is based on. :rtype: FreqDist """ return self._heldout_fdist def samples(self): return self._base_fdist.keys() def prob(self, sample): # Use our precomputed probability estimate. r = self._base_fdist[sample] return self._estimate[r] def max(self): # Note: the Heldout estimation is *not* necessarily monotonic; # so this implementation is currently broken. However, it # should give the right answer *most* of the time. :) return self._base_fdist.max() def discount(self): raise NotImplementedError() def __repr__(self): """ :rtype: str :return: A string representation of this ``ProbDist``. """ s = '' return s % (self._base_fdist.N(), self._heldout_fdist.N()) @compat.python_2_unicode_compatible class CrossValidationProbDist(ProbDistI): """ The cross-validation estimate for the probability distribution of the experiment used to generate a set of frequency distribution. The "cross-validation estimate" for the probability of a sample is found by averaging the held-out estimates for the sample in each pair of frequency distributions. """ SUM_TO_ONE = False def __init__(self, freqdists, bins): """ Use the cross-validation estimate to create a probability distribution for the experiment used to generate ``freqdists``. 
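For example, the pairwise heldout estimates and their average can be checked
by hand on tiny distributions (an illustrative sketch; the three "corpora"
below are toy strings):

>>> from nltk.probability import FreqDist, HeldoutProbDist, CrossValidationProbDist
>>> fd1, fd2, fd3 = FreqDist('aaab'), FreqDist('aabb'), FreqDist('abbb')
>>> HeldoutProbDist(fd1, fd2).prob('a')    # Tr[3]/(Nr[3]*N) = 2/(1*4)
0.5
>>> cv = CrossValidationProbDist([fd1, fd2, fd3], bins=2)
>>> cv.prob('a')                           # mean of the six pairwise heldout estimates
0.5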
:type freqdists: list(FreqDist) :param freqdists: A list of the frequency distributions generated by the experiment. :type bins: int :param bins: The number of sample values that can be generated by the experiment that is described by the probability distribution. This value must be correctly set for the probabilities of the sample values to sum to one. If ``bins`` is not specified, it defaults to ``freqdist.B()``. """ self._freqdists = freqdists # Create a heldout probability distribution for each pair of # frequency distributions in freqdists. self._heldout_probdists = [] for fdist1 in freqdists: for fdist2 in freqdists: if fdist1 is not fdist2: probdist = HeldoutProbDist(fdist1, fdist2, bins) self._heldout_probdists.append(probdist) def freqdists(self): """ Return the list of frequency distributions that this ``ProbDist`` is based on. :rtype: list(FreqDist) """ return self._freqdists def samples(self): # [xx] nb: this is not too efficient return set(sum([list(fd) for fd in self._freqdists], [])) def prob(self, sample): # Find the average probability estimate returned by each # heldout distribution. prob = 0.0 for heldout_probdist in self._heldout_probdists: prob += heldout_probdist.prob(sample) return prob/len(self._heldout_probdists) def discount(self): raise NotImplementedError() def __repr__(self): """ Return a string representation of this ``ProbDist``. :rtype: str """ return '' % len(self._freqdists) @compat.python_2_unicode_compatible class WittenBellProbDist(ProbDistI): """ The Witten-Bell estimate of a probability distribution. This distribution allocates uniform probability mass to as yet unseen events by using the number of events that have only been seen once. The probability mass reserved for unseen events is equal to *T / (N + T)* where *T* is the number of observed event types and *N* is the total number of observed events. This equates to the maximum likelihood estimate of a new type event occurring. The remaining probability mass is discounted such that all probability estimates sum to one, yielding: - *p = T / Z (N + T)*, if count = 0 - *p = c / (N + T)*, otherwise """ def __init__(self, freqdist, bins=None): """ Creates a distribution of Witten-Bell probability estimates. This distribution allocates uniform probability mass to as yet unseen events by using the number of events that have only been seen once. The probability mass reserved for unseen events is equal to *T / (N + T)* where *T* is the number of observed event types and *N* is the total number of observed events. This equates to the maximum likelihood estimate of a new type event occurring. The remaining probability mass is discounted such that all probability estimates sum to one, yielding: - *p = T / Z (N + T)*, if count = 0 - *p = c / (N + T)*, otherwise The parameters *T* and *N* are taken from the ``freqdist`` parameter (the ``B()`` and ``N()`` values). The normalizing factor *Z* is calculated using these values along with the ``bins`` parameter. :param freqdist: The frequency counts upon which to base the estimation. :type freqdist: FreqDist :param bins: The number of possible event types. This must be at least as large as the number of bins in the ``freqdist``. 
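For example, with 26 possible characters as bins (an illustrative sketch; the
toy data and the bin count are arbitrary):

>>> from nltk.probability import FreqDist, WittenBellProbDist
>>> fd = FreqDist('abracadabra')          # T = 5 seen types, N = 11 outcomes
>>> wb = WittenBellProbDist(fd, bins=26)  # Z = 26 - 5 = 21 unseen types
>>> wb.prob('a')                          # seen: c / (N + T) = 5 / 16
0.3125
>>> round(wb.prob('z'), 6)                # unseen: T / (Z * (N + T)) = 5 / 336
0.014881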
If None, then it's assumed to be equal to that of the ``freqdist`` :type bins: int """ assert bins is None or bins >= freqdist.B(),\ 'bins parameter must not be less than %d=freqdist.B()' % freqdist.B() if bins is None: bins = freqdist.B() self._freqdist = freqdist self._T = self._freqdist.B() self._Z = bins - self._freqdist.B() self._N = self._freqdist.N() # self._P0 is P(0), precalculated for efficiency: if self._N==0: # if freqdist is empty, we approximate P(0) by a UniformProbDist: self._P0 = 1.0 / self._Z else: self._P0 = self._T / float(self._Z * (self._N + self._T)) def prob(self, sample): # inherit docs from ProbDistI c = self._freqdist[sample] return (c / float(self._N + self._T) if c != 0 else self._P0) def max(self): return self._freqdist.max() def samples(self): return self._freqdist.keys() def freqdist(self): return self._freqdist def discount(self): raise NotImplementedError() def __repr__(self): """ Return a string representation of this ``ProbDist``. :rtype: str """ return '' % self._freqdist.N() ##////////////////////////////////////////////////////// ## Good-Turing Probability Distributions ##////////////////////////////////////////////////////// # Good-Turing frequency estimation was contributed by Alan Turing and # his statistical assistant I.J. Good, during their collaboration in # the WWII. It is a statistical technique for predicting the # probability of occurrence of objects belonging to an unknown number # of species, given past observations of such objects and their # species. (In drawing balls from an urn, the 'objects' would be balls # and the 'species' would be the distinct colors of the balls (finite # but unknown in number). # # Good-Turing method calculates the probability mass to assign to # events with zero or low counts based on the number of events with # higher counts. It does so by using the adjusted count *c\**: # # - *c\* = (c + 1) N(c + 1) / N(c)* for c >= 1 # - *things with frequency zero in training* = N(1) for c == 0 # # where *c* is the original count, *N(i)* is the number of event types # observed with count *i*. We can think the count of unseen as the count # of frequency one (see Jurafsky & Martin 2nd Edition, p101). # # This method is problematic because the situation ``N(c+1) == 0`` # is quite common in the original Good-Turing estimation; smoothing or # interpolation of *N(i)* values is essential in practice. # # Bill Gale and Geoffrey Sampson present a simple and effective approach, # Simple Good-Turing. As a smoothing curve they simply use a power curve: # # Nr = a*r^b (with b < -1 to give the appropriate hyperbolic # relationship) # # They estimate a and b by simple linear regression technique on the # logarithmic form of the equation: # # log Nr = a + b*log(r) # # However, they suggest that such a simple curve is probably only # appropriate for high values of r. For low values of r, they use the # measured Nr directly. (see M&S, p.213) # # Gale and Sampson propose to use r while the difference between r and # r* is 1.96 greater than the standard deviation, and switch to r* if # it is less or equal: # # |r - r*| > 1.96 * sqrt((r + 1)^2 (Nr+1 / Nr^2) (1 + Nr+1 / Nr)) # # The 1.96 coefficient correspond to a 0.05 significance criterion, # some implementations can use a coefficient of 1.65 for a 0.1 # significance criterion. 
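# As a concrete illustration, the unsmoothed adjusted count c* can be read
# directly off a FreqDist's r_Nr() mapping. The helper below is exposition
# only (its name is invented here and it is not part of the NLTK API); like
# plain Good-Turing it breaks down exactly where smoothing is needed, i.e.
# when N(c) or N(c+1) is zero.

def _demo_gt_adjusted_count(freqdist, c):
    """Return the unsmoothed Good-Turing adjusted count (c+1)*N(c+1)/N(c)."""
    r_Nr = freqdist.r_Nr()
    if r_Nr[c] == 0:
        raise ValueError('No samples occur exactly %d times' % c)
    return (c + 1) * r_Nr[c + 1] / float(r_Nr[c])

# E.g. for FreqDist('aabbbccdef') (where N(1)=3, N(2)=2, N(3)=1) this gives an
# adjusted count of 1.5 for c == 2, and the probability mass reserved for
# unseen events is N(1)/N = 3/10.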
# ##////////////////////////////////////////////////////// ## Simple Good-Turing Probablity Distributions ##////////////////////////////////////////////////////// @compat.python_2_unicode_compatible class SimpleGoodTuringProbDist(ProbDistI): """ SimpleGoodTuring ProbDist approximates from frequency to frequency of frequency into a linear line under log space by linear regression. Details of Simple Good-Turing algorithm can be found in: - Good Turing smoothing without tears" (Gale & Sampson 1995), Journal of Quantitative Linguistics, vol. 2 pp. 217-237. - "Speech and Language Processing (Jurafsky & Martin), 2nd Edition, Chapter 4.5 p103 (log(Nc) = a + b*log(c)) - http://www.grsampson.net/RGoodTur.html Given a set of pair (xi, yi), where the xi denotes the frequency and yi denotes the frequency of frequency, we want to minimize their square variation. E(x) and E(y) represent the mean of xi and yi. - slope: b = sigma ((xi-E(x)(yi-E(y))) / sigma ((xi-E(x))(xi-E(x))) - intercept: a = E(y) - b.E(x) """ SUM_TO_ONE = False def __init__(self, freqdist, bins=None): """ :param freqdist: The frequency counts upon which to base the estimation. :type freqdist: FreqDist :param bins: The number of possible event types. This must be larger than the number of bins in the ``freqdist``. If None, then it's assumed to be equal to ``freqdist``.B() + 1 :type bins: int """ assert bins is None or bins > freqdist.B(),\ 'bins parameter must not be less than %d=freqdist.B()+1' % (freqdist.B()+1) if bins is None: bins = freqdist.B() + 1 self._freqdist = freqdist self._bins = bins r, nr = self._r_Nr() self.find_best_fit(r, nr) self._switch(r, nr) self._renormalize(r, nr) def _r_Nr_non_zero(self): r_Nr = self._freqdist.r_Nr() del r_Nr[0] return r_Nr def _r_Nr(self): """ Split the frequency distribution in two list (r, Nr), where Nr(r) > 0 """ nonzero = self._r_Nr_non_zero() if not nonzero: return [], [] return zip(*sorted(nonzero.items())) def find_best_fit(self, r, nr): """ Use simple linear regression to tune parameters self._slope and self._intercept in the log-log space based on count and Nr(count) (Work in log space to avoid floating point underflow.) """ # For higher sample frequencies the data points becomes horizontal # along line Nr=1. To create a more evident linear model in log-log # space, we average positive Nr values with the surrounding zero # values. (Church and Gale, 1991) if not r or not nr: # Empty r or nr? return zr = [] for j in range(len(r)): i = (r[j-1] if j > 0 else 0) k = (2 * r[j] - i if j == len(r) - 1 else r[j+1]) zr_ = 2.0 * nr[j] / (k - i) zr.append(zr_) log_r = [math.log(i) for i in r] log_zr = [math.log(i) for i in zr] xy_cov = x_var = 0.0 x_mean = 1.0 * sum(log_r) / len(log_r) y_mean = 1.0 * sum(log_zr) / len(log_zr) for (x, y) in zip(log_r, log_zr): xy_cov += (x - x_mean) * (y - y_mean) x_var += (x - x_mean)**2 self._slope = (xy_cov / x_var if x_var != 0 else 0.0) if self._slope >= -1: warnings.warn('SimpleGoodTuring did not find a proper best fit ' 'line for smoothing probabilities of occurrences. ' 'The probability estimates are likely to be ' 'unreliable.') self._intercept = y_mean - self._slope * x_mean def _switch(self, r, nr): """ Calculate the r frontier where we must switch from Nr to Sr when estimating E[Nr]. 
""" for i, r_ in enumerate(r): if len(r) == i + 1 or r[i+1] != r_ + 1: # We are at the end of r, or there is a gap in r self._switch_at = r_ break Sr = self.smoothedNr smooth_r_star = (r_ + 1) * Sr(r_+1) / Sr(r_) unsmooth_r_star = 1.0 * (r_ + 1) * nr[i+1] / nr[i] std = math.sqrt(self._variance(r_, nr[i], nr[i+1])) if abs(unsmooth_r_star-smooth_r_star) <= 1.96 * std: self._switch_at = r_ break def _variance(self, r, nr, nr_1): r = float(r) nr = float(nr) nr_1 = float(nr_1) return (r + 1.0)**2 * (nr_1 / nr**2) * (1.0 + nr_1 / nr) def _renormalize(self, r, nr): """ It is necessary to renormalize all the probability estimates to ensure a proper probability distribution results. This can be done by keeping the estimate of the probability mass for unseen items as N(1)/N and renormalizing all the estimates for previously seen items (as Gale and Sampson (1995) propose). (See M&S P.213, 1999) """ prob_cov = 0.0 for r_, nr_ in zip(r, nr): prob_cov += nr_ * self._prob_measure(r_) if prob_cov: self._renormal = (1 - self._prob_measure(0)) / prob_cov def smoothedNr(self, r): """ Return the number of samples with count r. :param r: The amount of frequency. :type r: int :rtype: float """ # Nr = a*r^b (with b < -1 to give the appropriate hyperbolic # relationship) # Estimate a and b by simple linear regression technique on # the logarithmic form of the equation: log Nr = a + b*log(r) return math.exp(self._intercept + self._slope * math.log(r)) def prob(self, sample): """ Return the sample's probability. :param sample: sample of the event :type sample: str :rtype: float """ count = self._freqdist[sample] p = self._prob_measure(count) if count == 0: if self._bins == self._freqdist.B(): p = 0.0 else: p = p / (1.0 * self._bins - self._freqdist.B()) else: p = p * self._renormal return p def _prob_measure(self, count): if count == 0 and self._freqdist.N() == 0 : return 1.0 elif count == 0 and self._freqdist.N() != 0: return 1.0 * self._freqdist.Nr(1) / self._freqdist.N() if self._switch_at > count: Er_1 = 1.0 * self._freqdist.Nr(count+1) Er = 1.0 * self._freqdist.Nr(count) else: Er_1 = self.smoothedNr(count+1) Er = self.smoothedNr(count) r_star = (count + 1) * Er_1 / Er return r_star / self._freqdist.N() def check(self): prob_sum = 0.0 for i in range(0, len(self._Nr)): prob_sum += self._Nr[i] * self._prob_measure(i) / self._renormal print("Probability Sum:", prob_sum) #assert prob_sum != 1.0, "probability sum should be one!" def discount(self): """ This function returns the total mass of probability transfers from the seen samples to the unseen samples. """ return 1.0 * self.smoothedNr(1) / self._freqdist.N() def max(self): return self._freqdist.max() def samples(self): return self._freqdist.keys() def freqdist(self): return self._freqdist def __repr__(self): """ Return a string representation of this ``ProbDist``. :rtype: str """ return ''\ % self._freqdist.N() class MutableProbDist(ProbDistI): """ An mutable probdist where the probabilities may be easily modified. This simply copies an existing probdist, storing the probability values in a mutable dictionary and providing an update method. """ def __init__(self, prob_dist, samples, store_logs=True): """ Creates the mutable probdist based on the given prob_dist and using the list of samples given. These values are stored as log probabilities if the store_logs flag is set. 
:param prob_dist: the distribution from which to garner the probabilities :type prob_dist: ProbDist :param samples: the complete set of samples :type samples: sequence of any :param store_logs: whether to store the probabilities as logarithms :type store_logs: bool """ self._samples = samples self._sample_dict = dict((samples[i], i) for i in range(len(samples))) self._data = array.array(str("d"), [0.0]) * len(samples) for i in range(len(samples)): if store_logs: self._data[i] = prob_dist.logprob(samples[i]) else: self._data[i] = prob_dist.prob(samples[i]) self._logs = store_logs def samples(self): # inherit documentation return self._samples def prob(self, sample): # inherit documentation i = self._sample_dict.get(sample) if i is None: return 0.0 return (2**(self._data[i]) if self._logs else self._data[i]) def logprob(self, sample): # inherit documentation i = self._sample_dict.get(sample) if i is None: return float('-inf') return (self._data[i] if self._logs else math.log(self._data[i], 2)) def update(self, sample, prob, log=True): """ Update the probability for the given sample. This may cause the object to stop being the valid probability distribution - the user must ensure that they update the sample probabilities such that all samples have probabilities between 0 and 1 and that all probabilities sum to one. :param sample: the sample for which to update the probability :type sample: any :param prob: the new probability :type prob: float :param log: is the probability already logged :type log: bool """ i = self._sample_dict.get(sample) assert i is not None if self._logs: self._data[i] = (prob if log else math.log(prob, 2)) else: self._data[i] = (2**(prob) if log else prob) ##///////////////////////////////////////////////////// ## Kneser-Ney Probability Distribution ##////////////////////////////////////////////////////// # This method for calculating probabilities was introduced in 1995 by Reinhard # Kneser and Hermann Ney. It was meant to improve the accuracy of language # models that use backing-off to deal with sparse data. The authors propose two # ways of doing so: a marginal distribution constraint on the back-off # distribution and a leave-one-out distribution. For a start, the first one is # implemented as a class below. # # The idea behind a back-off n-gram model is that we have a series of # frequency distributions for our n-grams so that in case we have not seen a # given n-gram during training (and as a result have a 0 probability for it) we # can 'back off' (hence the name!) and try testing whether we've seen the # n-1-gram part of the n-gram in training. # # The novelty of Kneser and Ney's approach was that they decided to fiddle # around with the way this latter, backed off probability was being calculated # whereas their peers seemed to focus on the primary probability. # # The implementation below uses one of the techniques described in their paper # titled "Improved backing-off for n-gram language modeling." In the same paper # another technique is introduced to attempt to smooth the back-off # distribution as well as the primary one. There is also a much-cited # modification of this method proposed by Chen and Goodman. # # In order for the implementation of Kneser-Ney to be more efficient, some # changes have been made to the original algorithm. Namely, the calculation of # the normalizing function gamma has been significantly simplified and # combined slightly differently with beta. 
None of these changes affect the # nature of the algorithm, but instead aim to cut out unnecessary calculations # and take advantage of storing and retrieving information in dictionaries # where possible. @compat.python_2_unicode_compatible class KneserNeyProbDist(ProbDistI): """ Kneser-Ney estimate of a probability distribution. This is a version of back-off that counts how likely an n-gram is provided the n-1-gram had been seen in training. Extends the ProbDistI interface, requires a trigram FreqDist instance to train on. Optionally, a different from default discount value can be specified. The default discount is set to 0.75. """ def __init__(self, freqdist, bins=None, discount=0.75): """ :param freqdist: The trigram frequency distribution upon which to base the estimation :type freqdist: FreqDist :param bins: Included for compatibility with nltk.tag.hmm :type bins: int or float :param discount: The discount applied when retrieving counts of trigrams :type discount: float (preferred, but can be set to int) """ if not bins: self._bins = freqdist.B() else: self._bins = bins self._D = discount # cache for probability calculation self._cache = {} # internal bigram and trigram frequency distributions self._bigrams = defaultdict(int) self._trigrams = freqdist # helper dictionaries used to calculate probabilities self._wordtypes_after = defaultdict(float) self._trigrams_contain = defaultdict(float) self._wordtypes_before = defaultdict(float) for w0, w1, w2 in freqdist: self._bigrams[(w0,w1)] += freqdist[(w0, w1, w2)] self._wordtypes_after[(w0,w1)] += 1 self._trigrams_contain[w1] += 1 self._wordtypes_before[(w1,w2)] += 1 def prob(self, trigram): # sample must be a triple if len(trigram) != 3: raise ValueError('Expected an iterable with 3 members.') trigram = tuple(trigram) w0, w1, w2 = trigram if trigram in self._cache: return self._cache[trigram] else: # if the sample trigram was seen during training if trigram in self._trigrams: prob = (self._trigrams[trigram] - self.discount())/self._bigrams[(w0, w1)] # else if the 'rougher' environment was seen during training elif (w0,w1) in self._bigrams and (w1,w2) in self._wordtypes_before: aftr = self._wordtypes_after[(w0, w1)] bfr = self._wordtypes_before[(w1, w2)] # the probability left over from alphas leftover_prob = ((aftr * self.discount()) / self._bigrams[(w0, w1)]) # the beta (including normalization) beta = bfr /(self._trigrams_contain[w1] - aftr) prob = leftover_prob * beta # else the sample was completely unseen during training else: prob = 0.0 self._cache[trigram] = prob return prob def discount(self): """ Return the value by which counts are discounted. By default set to 0.75. :rtype: float """ return self._D def set_discount(self, discount): """ Set the value by which counts are discounted to the value of discount. :param discount: the new value to discount counts by :type discount: float (preferred, but int possible) :rtype: None """ self._D = discount def samples(self): return self._trigrams.keys() def max(self): return self._trigrams.max() def __repr__(self): ''' Return a string representation of this ProbDist :rtype: str ''' return '>> from nltk.probability import ConditionalFreqDist >>> from nltk.tokenize import word_tokenize >>> sent = "the the the dog dog some other words that we do not care about" >>> cfdist = ConditionalFreqDist() >>> for word in word_tokenize(sent): ... condition = len(word) ... 
cfdist[condition][word] += 1 An equivalent way to do this is with the initializer: >>> cfdist = ConditionalFreqDist((len(word), word) for word in word_tokenize(sent)) The frequency distribution for each condition is accessed using the indexing operator: >>> cfdist[3] FreqDist({'the': 3, 'dog': 2, 'not': 1}) >>> cfdist[3].freq('the') 0.5 >>> cfdist[3]['dog'] 2 When the indexing operator is used to access the frequency distribution for a condition that has not been accessed before, ``ConditionalFreqDist`` creates a new empty FreqDist for that condition. """ def __init__(self, cond_samples=None): """ Construct a new empty conditional frequency distribution. In particular, the count for every sample, under every condition, is zero. :param cond_samples: The samples to initialize the conditional frequency distribution with :type cond_samples: Sequence of (condition, sample) tuples """ defaultdict.__init__(self, FreqDist) if cond_samples: for (cond, sample) in cond_samples: self[cond][sample] += 1 def __reduce__(self): kv_pairs = ((cond, self[cond]) for cond in self.conditions()) return (self.__class__, (), None, None, kv_pairs) def conditions(self): """ Return a list of the conditions that have been accessed for this ``ConditionalFreqDist``. Use the indexing operator to access the frequency distribution for a given condition. Note that the frequency distributions for some conditions may contain zero sample outcomes. :rtype: list """ return list(self.keys()) def N(self): """ Return the total number of sample outcomes that have been recorded by this ``ConditionalFreqDist``. :rtype: int """ return sum(fdist.N() for fdist in compat.itervalues(self)) def plot(self, *args, **kwargs): """ Plot the given samples from the conditional frequency distribution. For a cumulative plot, specify cumulative=True. (Requires Matplotlib to be installed.) :param samples: The samples to plot :type samples: list :param title: The title for the graph :type title: str :param conditions: The conditions to plot (default is all) :type conditions: list """ try: from matplotlib import pylab except ImportError: raise ValueError('The plot function requires matplotlib to be installed.' 'See http://matplotlib.org/') cumulative = _get_kwarg(kwargs, 'cumulative', False) conditions = _get_kwarg(kwargs, 'conditions', sorted(self.conditions())) title = _get_kwarg(kwargs, 'title', '') samples = _get_kwarg(kwargs, 'samples', sorted(set(v for c in conditions for v in self[c]))) # this computation could be wasted if not "linewidth" in kwargs: kwargs["linewidth"] = 2 for condition in conditions: if cumulative: freqs = list(self[condition]._cumulative_frequencies(samples)) ylabel = "Cumulative Counts" legend_loc = 'lower right' else: freqs = [self[condition][sample] for sample in samples] ylabel = "Counts" legend_loc = 'upper right' # percents = [f * 100 for f in freqs] only in ConditionalProbDist? kwargs['label'] = "%s" % condition pylab.plot(freqs, *args, **kwargs) pylab.legend(loc=legend_loc) pylab.grid(True, color="silver") pylab.xticks(range(len(samples)), [compat.text_type(s) for s in samples], rotation=90) if title: pylab.title(title) pylab.xlabel("Samples") pylab.ylabel(ylabel) pylab.show() def tabulate(self, *args, **kwargs): """ Tabulate the given samples from the conditional frequency distribution. 
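For example, tabulating word counts by word length (an illustrative sketch
with a toy sentence):

>>> from nltk.probability import ConditionalFreqDist
>>> cfd = ConditionalFreqDist((len(w), w) for w in 'the cat sat on the mat'.split())
>>> sorted(cfd.conditions())
[2, 3]
>>> cfd[3]['the']
2
>>> cfd.N()      # total outcomes summed over all conditions
6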
:param samples: The samples to plot :type samples: list :param title: The title for the graph :type title: str :param conditions: The conditions to plot (default is all) :type conditions: list """ cumulative = _get_kwarg(kwargs, 'cumulative', False) conditions = _get_kwarg(kwargs, 'conditions', sorted(self.conditions())) samples = _get_kwarg(kwargs, 'samples', sorted(set(v for c in conditions for v in self[c]))) # this computation could be wasted condition_size = max(len("%s" % c) for c in conditions) print(' ' * condition_size, end=' ') for s in samples: print("%4s" % s, end=' ') print() for c in conditions: print("%*s" % (condition_size, c), end=' ') if cumulative: freqs = list(self[c]._cumulative_frequencies(samples)) else: freqs = [self[c][sample] for sample in samples] for f in freqs: print("%4d" % f, end=' ') print() # @total_ordering doesn't work here, since the class inherits from a builtin class def __le__(self, other): if not isinstance(other, ConditionalFreqDist): raise_unorderable_types("<=", self, other) return set(self.conditions()).issubset(other.conditions()) \ and all(self[c] <= other[c] for c in self.conditions()) def __lt__(self, other): if not isinstance(other, ConditionalFreqDist): raise_unorderable_types("<", self, other) return self <= other and self != other def __ge__(self, other): if not isinstance(other, ConditionalFreqDist): raise_unorderable_types(">=", self, other) return other <= self def __gt__(self, other): if not isinstance(other, ConditionalFreqDist): raise_unorderable_types(">", self, other) return other < self def __repr__(self): """ Return a string representation of this ``ConditionalFreqDist``. :rtype: str """ return '' % len(self) @compat.python_2_unicode_compatible class ConditionalProbDistI(dict): """ A collection of probability distributions for a single experiment run under different conditions. Conditional probability distributions are used to estimate the likelihood of each sample, given the condition under which the experiment was run. For example, a conditional probability distribution could be used to estimate the probability of each word type in a document, given the length of the word type. Formally, a conditional probability distribution can be defined as a function that maps from each condition to the ``ProbDist`` for the experiment under that condition. """ def __init__(self): raise NotImplementedError("Interfaces can't be instantiated") def conditions(self): """ Return a list of the conditions that are represented by this ``ConditionalProbDist``. Use the indexing operator to access the probability distribution for a given condition. :rtype: list """ return list(self.keys()) def __repr__(self): """ Return a string representation of this ``ConditionalProbDist``. :rtype: str """ return '<%s with %d conditions>' % (type(self).__name__, len(self)) class ConditionalProbDist(ConditionalProbDistI): """ A conditional probability distribution modeling the experiments that were used to generate a conditional frequency distribution. A ConditionalProbDist is constructed from a ``ConditionalFreqDist`` and a ``ProbDist`` factory: - The ``ConditionalFreqDist`` specifies the frequency distribution for each condition. - The ``ProbDist`` factory is a function that takes a condition's frequency distribution, and returns its probability distribution. A ``ProbDist`` class's name (such as ``MLEProbDist`` or ``HeldoutProbDist``) can be used to specify that class's constructor. 
The first argument to the ``ProbDist`` factory is the frequency distribution that it should model; and the remaining arguments are specified by the ``factory_args`` parameter to the ``ConditionalProbDist`` constructor. For example, the following code constructs a ``ConditionalProbDist``, where the probability distribution for each condition is an ``ELEProbDist`` with 10 bins: >>> from nltk.corpus import brown >>> from nltk.probability import ConditionalFreqDist >>> from nltk.probability import ConditionalProbDist, ELEProbDist >>> cfdist = ConditionalFreqDist(brown.tagged_words()[:5000]) >>> cpdist = ConditionalProbDist(cfdist, ELEProbDist, 10) >>> cpdist['passed'].max() 'VBD' >>> cpdist['passed'].prob('VBD') 0.423... """ def __init__(self, cfdist, probdist_factory, *factory_args, **factory_kw_args): """ Construct a new conditional probability distribution, based on the given conditional frequency distribution and ``ProbDist`` factory. :type cfdist: ConditionalFreqDist :param cfdist: The ``ConditionalFreqDist`` specifying the frequency distribution for each condition. :type probdist_factory: class or function :param probdist_factory: The function or class that maps a condition's frequency distribution to its probability distribution. The function is called with the frequency distribution as its first argument, ``factory_args`` as its remaining arguments, and ``factory_kw_args`` as keyword arguments. :type factory_args: (any) :param factory_args: Extra arguments for ``probdist_factory``. These arguments are usually used to specify extra properties for the probability distributions of individual conditions, such as the number of bins they contain. :type factory_kw_args: (any) :param factory_kw_args: Extra keyword arguments for ``probdist_factory``. """ self._probdist_factory = probdist_factory self._factory_args = factory_args self._factory_kw_args = factory_kw_args for condition in cfdist: self[condition] = probdist_factory(cfdist[condition], *factory_args, **factory_kw_args) def __missing__(self, key): self[key] = self._probdist_factory(FreqDist(), *self._factory_args, **self._factory_kw_args) return self[key] class DictionaryConditionalProbDist(ConditionalProbDistI): """ An alternative ConditionalProbDist that simply wraps a dictionary of ProbDists rather than creating these from FreqDists. """ def __init__(self, probdist_dict): """ :param probdist_dict: a dictionary containing the probdists indexed by the conditions :type probdist_dict: dict any -> probdist """ self.update(probdist_dict) def __missing__(self, key): self[key] = DictionaryProbDist() return self[key] ##////////////////////////////////////////////////////// ## Adding in log-space. ##////////////////////////////////////////////////////// # If the difference is bigger than this, then just take the bigger one: _ADD_LOGS_MAX_DIFF = math.log(1e-30, 2) def add_logs(logx, logy): """ Given two numbers ``logx`` = *log(x)* and ``logy`` = *log(y)*, return *log(x+y)*. Conceptually, this is the same as returning ``log(2**(logx)+2**(logy))``, but the actual implementation avoids overflow errors that could result from direct computation. 
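For example (an illustrative sanity check; at this scale the naive
computation would work too, and the log-space version only pays off for very
small probabilities):

>>> from math import log
>>> round(add_logs(log(0.25, 2), log(0.25, 2)), 6)    # log2(0.25 + 0.25)
-1.0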
""" if (logx < logy + _ADD_LOGS_MAX_DIFF): return logy if (logy < logx + _ADD_LOGS_MAX_DIFF): return logx base = min(logx, logy) return base + math.log(2**(logx-base) + 2**(logy-base), 2) def sum_logs(logs): return (reduce(add_logs, logs[1:], logs[0]) if len(logs) != 0 else _NINF) ##////////////////////////////////////////////////////// ## Probabilistic Mix-in ##////////////////////////////////////////////////////// class ProbabilisticMixIn(object): """ A mix-in class to associate probabilities with other classes (trees, rules, etc.). To use the ``ProbabilisticMixIn`` class, define a new class that derives from an existing class and from ProbabilisticMixIn. You will need to define a new constructor for the new class, which explicitly calls the constructors of both its parent classes. For example: >>> from nltk.probability import ProbabilisticMixIn >>> class A: ... def __init__(self, x, y): self.data = (x,y) ... >>> class ProbabilisticA(A, ProbabilisticMixIn): ... def __init__(self, x, y, **prob_kwarg): ... A.__init__(self, x, y) ... ProbabilisticMixIn.__init__(self, **prob_kwarg) See the documentation for the ProbabilisticMixIn ``constructor<__init__>`` for information about the arguments it expects. You should generally also redefine the string representation methods, the comparison methods, and the hashing method. """ def __init__(self, **kwargs): """ Initialize this object's probability. This initializer should be called by subclass constructors. ``prob`` should generally be the first argument for those constructors. :param prob: The probability associated with the object. :type prob: float :param logprob: The log of the probability associated with the object. :type logprob: float """ if 'prob' in kwargs: if 'logprob' in kwargs: raise TypeError('Must specify either prob or logprob ' '(not both)') else: ProbabilisticMixIn.set_prob(self, kwargs['prob']) elif 'logprob' in kwargs: ProbabilisticMixIn.set_logprob(self, kwargs['logprob']) else: self.__prob = self.__logprob = None def set_prob(self, prob): """ Set the probability associated with this object to ``prob``. :param prob: The new probability :type prob: float """ self.__prob = prob self.__logprob = None def set_logprob(self, logprob): """ Set the log probability associated with this object to ``logprob``. I.e., set the probability associated with this object to ``2**(logprob)``. :param logprob: The new log probability :type logprob: float """ self.__logprob = logprob self.__prob = None def prob(self): """ Return the probability associated with this object. :rtype: float """ if self.__prob is None: if self.__logprob is None: return None self.__prob = 2**(self.__logprob) return self.__prob def logprob(self): """ Return ``log(p)``, where ``p`` is the probability associated with this object. 
:rtype: float """ if self.__logprob is None: if self.__prob is None: return None self.__logprob = math.log(self.__prob, 2) return self.__logprob class ImmutableProbabilisticMixIn(ProbabilisticMixIn): def set_prob(self, prob): raise ValueError('%s is immutable' % self.__class__.__name__) def set_logprob(self, prob): raise ValueError('%s is immutable' % self.__class__.__name__) ## Helper function for processing keyword arguments def _get_kwarg(kwargs, key, default): if key in kwargs: arg = kwargs[key] del kwargs[key] else: arg = default return arg ##////////////////////////////////////////////////////// ## Demonstration ##////////////////////////////////////////////////////// def _create_rand_fdist(numsamples, numoutcomes): """ Create a new frequency distribution, with random samples. The samples are numbers from 1 to ``numsamples``, and are generated by summing two numbers, each of which has a uniform distribution. """ import random fdist = FreqDist() for x in range(numoutcomes): y = (random.randint(1, (1 + numsamples) // 2) + random.randint(0, numsamples // 2)) fdist[y] += 1 return fdist def _create_sum_pdist(numsamples): """ Return the true probability distribution for the experiment ``_create_rand_fdist(numsamples, x)``. """ fdist = FreqDist() for x in range(1, (1 + numsamples) // 2 + 1): for y in range(0, numsamples // 2 + 1): fdist[x+y] += 1 return MLEProbDist(fdist) def demo(numsamples=6, numoutcomes=500): """ A demonstration of frequency distributions and probability distributions. This demonstration creates three frequency distributions with, and uses them to sample a random process with ``numsamples`` samples. Each frequency distribution is sampled ``numoutcomes`` times. These three frequency distributions are then used to build six probability distributions. Finally, the probability estimates of these distributions are compared to the actual probability of each sample. :type numsamples: int :param numsamples: The number of samples to use in each demo frequency distributions. :type numoutcomes: int :param numoutcomes: The total number of outcomes for each demo frequency distribution. These outcomes are divided into ``numsamples`` bins. :rtype: None """ # Randomly sample a stochastic process three times. fdist1 = _create_rand_fdist(numsamples, numoutcomes) fdist2 = _create_rand_fdist(numsamples, numoutcomes) fdist3 = _create_rand_fdist(numsamples, numoutcomes) # Use our samples to create probability distributions. pdists = [ MLEProbDist(fdist1), LidstoneProbDist(fdist1, 0.5, numsamples), HeldoutProbDist(fdist1, fdist2, numsamples), HeldoutProbDist(fdist2, fdist1, numsamples), CrossValidationProbDist([fdist1, fdist2, fdist3], numsamples), SimpleGoodTuringProbDist(fdist1), SimpleGoodTuringProbDist(fdist1, 7), _create_sum_pdist(numsamples), ] # Find the probability of each sample. vals = [] for n in range(1,numsamples+1): vals.append(tuple([n, fdist1.freq(n)] + [pdist.prob(n) for pdist in pdists])) # Print the results in a formatted table. 
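# Each row shows the sample value, its empirical frequency in fdist1,
# the estimate assigned by each probability distribution, and, in the
# final column, the true probability from _create_sum_pdist.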
print(('%d samples (1-%d); %d outcomes were sampled for each FreqDist' % (numsamples, numsamples, numoutcomes))) print('='*9*(len(pdists)+2)) FORMATSTR = ' FreqDist '+ '%8s '*(len(pdists)-1) + '| Actual' print(FORMATSTR % tuple(repr(pdist)[1:9] for pdist in pdists[:-1])) print('-'*9*(len(pdists)+2)) FORMATSTR = '%3d %8.6f ' + '%8.6f '*(len(pdists)-1) + '| %8.6f' for val in vals: print(FORMATSTR % val) # Print the totals for each column (should all be 1.0) zvals = list(zip(*vals)) sums = [sum(val) for val in zvals[1:]] print('-'*9*(len(pdists)+2)) FORMATSTR = 'Total ' + '%8.6f '*(len(pdists)) + '| %8.6f' print(FORMATSTR % tuple(sums)) print('='*9*(len(pdists)+2)) # Display the distributions themselves, if they're short enough. if len("%s" % fdist1) < 70: print(' fdist1: %s' % fdist1) print(' fdist2: %s' % fdist2) print(' fdist3: %s' % fdist3) print() print('Generating:') for pdist in pdists: fdist = FreqDist(pdist.generate() for i in range(5000)) print('%20s %s' % (pdist.__class__.__name__[:20], ("%s" % fdist)[:55])) print() def gt_demo(): from nltk import corpus emma_words = corpus.gutenberg.words('austen-emma.txt') fd = FreqDist(emma_words) sgt = SimpleGoodTuringProbDist(fd) print('%18s %8s %14s' \ % ("word", "freqency", "SimpleGoodTuring")) fd_keys_sorted=(key for key, value in sorted(fd.items(), key=lambda item: item[1], reverse=True)) for key in fd_keys_sorted: print('%18s %8d %14e' \ % (key, fd[key], sgt.prob(key))) if __name__ == '__main__': demo(6, 10) demo(5, 5000) gt_demo() __all__ = ['ConditionalFreqDist', 'ConditionalProbDist', 'ConditionalProbDistI', 'CrossValidationProbDist', 'DictionaryConditionalProbDist', 'DictionaryProbDist', 'ELEProbDist', 'FreqDist', 'SimpleGoodTuringProbDist', 'HeldoutProbDist', 'ImmutableProbabilisticMixIn', 'LaplaceProbDist', 'LidstoneProbDist', 'MLEProbDist', 'MutableProbDist', 'KneserNeyProbDist', 'ProbDistI', 'ProbabilisticMixIn', 'UniformProbDist', 'WittenBellProbDist', 'add_logs', 'log_likelihood', 'sum_logs', 'entropy'] nltk-3.1/nltk/sem/0000755000076500000240000000000012610001541013576 5ustar sbstaff00000000000000nltk-3.1/nltk/sem/__init__.py0000644000076500000240000000455112607224144015730 0ustar sbstaff00000000000000# Natural Language Toolkit: Semantic Interpretation # # Copyright (C) 2001-2015 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT """ NLTK Semantic Interpretation Package This package contains classes for representing semantic structure in formulas of first-order logic and for evaluating such formulas in set-theoretic models. >>> from nltk.sem import logic >>> logic._counter._value = 0 The package has two main components: - ``logic`` provides support for analyzing expressions of First Order Logic (FOL). - ``evaluate`` allows users to recursively determine truth in a model for formulas of FOL. A model consists of a domain of discourse and a valuation function, which assigns values to non-logical constants. We assume that entities in the domain are represented as strings such as ``'b1'``, ``'g1'``, etc. A ``Valuation`` is initialized with a list of (symbol, value) pairs, where values are entities, sets of entities or sets of tuples of entities. The domain of discourse can be inferred from the valuation, and model is then created with domain and valuation as parameters. >>> from nltk.sem import Valuation, Model >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'), ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ... ('dog', set(['d1'])), ... 
('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] >>> val = Valuation(v) >>> dom = val.domain >>> m = Model(dom, val) """ from nltk.sem.util import (parse_sents, interpret_sents, evaluate_sents, root_semrep) from nltk.sem.evaluate import (Valuation, Assignment, Model, Undefined, is_rel, set2rel, arity, read_valuation) from nltk.sem.logic import (boolean_ops, binding_ops, equality_preds, read_logic, Variable, Expression, ApplicationExpression, LogicalExpressionException) from nltk.sem.skolemize import skolemize from nltk.sem.lfg import FStructure from nltk.sem.relextract import (extract_rels, rtuple, clause) from nltk.sem.boxer import Boxer from nltk.sem.drt import DrtExpression, DRS # from nltk.sem.glue import Glue # from nltk.sem.hole import HoleSemantics # from nltk.sem.cooper_storage import CooperStore # don't import chat80 as its names are too generic nltk-3.1/nltk/sem/boxer.py0000644000076500000240000014075012607224144015312 0ustar sbstaff00000000000000# Natural Language Toolkit: Interface to Boxer # # # Author: Dan Garrette # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT """ An interface to Boxer. This interface relies on the latest version of the development (subversion) version of C&C and Boxer. Usage: Set the environment variable CANDC to the bin directory of your CandC installation. The models directory should be in the CandC root directory. For example: /path/to/candc/ bin/ candc boxer models/ boxer/ """ from __future__ import print_function, unicode_literals import os import re import operator import subprocess from optparse import OptionParser import tempfile from functools import reduce from nltk.internals import Counter, find_binary from nltk.sem.logic import (ExpectedMoreTokensException, LogicalExpressionException, UnexpectedTokenException, Variable) from nltk.sem.drt import (DRS, DrtApplicationExpression, DrtEqualityExpression, DrtNegatedExpression, DrtOrExpression, DrtParser, DrtProposition, DrtTokens, DrtVariableExpression) from nltk.compat import python_2_unicode_compatible class Boxer(object): """ This class is an interface to Johan Bos's program Boxer, a wide-coverage semantic parser that produces Discourse Representation Structures (DRSs). """ def __init__(self, boxer_drs_interpreter=None, elimeq=False, bin_dir=None, verbose=False, resolve=True): """ :param boxer_drs_interpreter: A class that converts from the ``AbstractBoxerDrs`` object hierarchy to a different object. The default is ``NltkDrtBoxerDrsInterpreter``, which converts to the NLTK DRT hierarchy. :param elimeq: When set to true, Boxer removes all equalities from the DRSs and discourse referents standing in the equality relation are unified, but only if this can be done in a meaning-preserving manner. :param resolve: When set to true, Boxer will resolve all anaphoric DRSs and perform merge-reduction. Resolution follows Van der Sandt's theory of binding and accommodation. 
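A typical instantiation, assuming the C&C and Boxer binaries are
installed and the ``CANDC`` environment variable is set as described
in the module docstring (the sentence below is only illustrative)::

    b = Boxer()
    drs = b.interpret('John sees a dog.')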
""" if boxer_drs_interpreter is None: boxer_drs_interpreter = NltkDrtBoxerDrsInterpreter() self._boxer_drs_interpreter = boxer_drs_interpreter self._resolve = resolve self._elimeq = elimeq self.set_bin_dir(bin_dir, verbose) def set_bin_dir(self, bin_dir, verbose=False): self._candc_bin = self._find_binary('candc', bin_dir, verbose) self._candc_models_path = os.path.normpath(os.path.join(self._candc_bin[:-5], '../models')) self._boxer_bin = self._find_binary('boxer', bin_dir, verbose) def interpret(self, input, discourse_id=None, question=False, verbose=False): """ Use Boxer to give a first order representation. :param input: str Input sentence to parse :param occur_index: bool Should predicates be occurrence indexed? :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate. :return: ``drt.DrtExpression`` """ discourse_ids = ([discourse_id] if discourse_id is not None else None) d, = self.interpret_multi_sents([[input]], discourse_ids, question, verbose) if not d: raise Exception('Unable to interpret: "%s"' % input) return d def interpret_multi(self, input, discourse_id=None, question=False, verbose=False): """ Use Boxer to give a first order representation. :param input: list of str Input sentences to parse as a single discourse :param occur_index: bool Should predicates be occurrence indexed? :param discourse_id: str An identifier to be inserted to each occurrence-indexed predicate. :return: ``drt.DrtExpression`` """ discourse_ids = ([discourse_id] if discourse_id is not None else None) d, = self.interpret_multi_sents([input], discourse_ids, question, verbose) if not d: raise Exception('Unable to interpret: "%s"' % input) return d def interpret_sents(self, inputs, discourse_ids=None, question=False, verbose=False): """ Use Boxer to give a first order representation. :param inputs: list of str Input sentences to parse as individual discourses :param occur_index: bool Should predicates be occurrence indexed? :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate. :return: list of ``drt.DrtExpression`` """ return self.interpret_multi_sents([[input] for input in inputs], discourse_ids, question, verbose) def interpret_multi_sents(self, inputs, discourse_ids=None, question=False, verbose=False): """ Use Boxer to give a first order representation. :param inputs: list of list of str Input discourses to parse :param occur_index: bool Should predicates be occurrence indexed? :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate. :return: ``drt.DrtExpression`` """ if discourse_ids is not None: assert len(inputs) == len(discourse_ids) assert reduce(operator.and_, (id is not None for id in discourse_ids)) use_disc_id = True else: discourse_ids = list(map(str, range(len(inputs)))) use_disc_id = False candc_out = self._call_candc(inputs, discourse_ids, question, verbose=verbose) boxer_out = self._call_boxer(candc_out, verbose=verbose) # if 'ERROR: input file contains no ccg/2 terms.' in boxer_out: # raise UnparseableInputException('Could not parse with candc: "%s"' % input_str) drs_dict = self._parse_to_drs_dict(boxer_out, use_disc_id) return [drs_dict.get(id, None) for id in discourse_ids] def _call_candc(self, inputs, discourse_ids, question, verbose=False): """ Call the ``candc`` binary with the given input. :param inputs: list of list of str Input discourses to parse :param discourse_ids: list of str Identifiers to be inserted to each occurrence-indexed predicate. 
:param filename: str A filename for the output file :return: stdout """ args = ['--models', os.path.join(self._candc_models_path, ['boxer','questions'][question]), '--candc-printer', 'boxer'] return self._call('\n'.join(sum((["'%s'" % id] + d for d,id in zip(inputs,discourse_ids)), [])), self._candc_bin, args, verbose) def _call_boxer(self, candc_out, verbose=False): """ Call the ``boxer`` binary with the given input. :param candc_out: str output from C&C parser :return: stdout """ f = None try: fd, temp_filename = tempfile.mkstemp(prefix='boxer-', suffix='.in', text=True) f = os.fdopen(fd, 'w') f.write(candc_out) finally: if f: f.close() args = ['--box', 'false', '--semantics', 'drs', #'--flat', 'false', # removed from boxer '--resolve', ['false','true'][self._resolve], '--elimeq', ['false','true'][self._elimeq], '--format', 'prolog', '--instantiate', 'true', '--input', temp_filename] stdout = self._call(None, self._boxer_bin, args, verbose) os.remove(temp_filename) return stdout def _find_binary(self, name, bin_dir, verbose=False): return find_binary(name, path_to_bin=bin_dir, env_vars=['CANDC'], url='http://svn.ask.it.usyd.edu.au/trac/candc/', binary_names=[name, name + '.exe'], verbose=verbose) def _call(self, input_str, binary, args=[], verbose=False): """ Call the binary with the given input. :param input_str: A string whose contents are used as stdin. :param binary: The location of the binary to call :param args: A list of command-line arguments. :return: stdout """ if verbose: print('Calling:', binary) print('Args:', args) print('Input:', input_str) print('Command:', binary + ' ' + ' '.join(args)) # Call via a subprocess if input_str is None: cmd = [binary] + args p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE) else: cmd = 'echo "%s" | %s %s' % (input_str, binary, ' '.join(args)) p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) stdout, stderr = p.communicate() if verbose: print('Return code:', p.returncode) if stdout: print('stdout:\n', stdout, '\n') if stderr: print('stderr:\n', stderr, '\n') if p.returncode != 0: raise Exception('ERROR CALLING: %s %s\nReturncode: %d\n%s' % (binary, ' '.join(args), p.returncode, stderr)) return stdout def _parse_to_drs_dict(self, boxer_out, use_disc_id): lines = boxer_out.split('\n') drs_dict = {} i = 0 while i < len(lines): line = lines[i] if line.startswith('id('): comma_idx = line.index(',') discourse_id = line[3:comma_idx] if discourse_id[0] == "'" and discourse_id[-1] == "'": discourse_id = discourse_id[1:-1] drs_id = line[comma_idx+1:line.index(')')] i += 1 line = lines[i] assert line.startswith('sem(%s,' % drs_id) if line[-4:] == "').'": line = line[:-4] + ")." 
assert line.endswith(').'), "can't parse line: %s" % line search_start = len('sem(%s,[' % drs_id) brace_count = 1 drs_start = -1 for j,c in enumerate(line[search_start:]): if(c == '['): brace_count += 1 if(c == ']'): brace_count -= 1 if(brace_count == 0): drs_start = search_start + j + 1 if line[drs_start:drs_start+3] == "','": drs_start = drs_start + 3 else: drs_start = drs_start + 1 break assert drs_start > -1 drs_input = line[drs_start:-2].strip() parsed = self._parse_drs(drs_input, discourse_id, use_disc_id) drs_dict[discourse_id] = self._boxer_drs_interpreter.interpret(parsed) i += 1 return drs_dict def _parse_drs(self, drs_string, discourse_id, use_disc_id): return BoxerOutputDrsParser([None,discourse_id][use_disc_id]).parse(drs_string) class BoxerOutputDrsParser(DrtParser): def __init__(self, discourse_id=None): """ This class is used to parse the Prolog DRS output from Boxer into a hierarchy of python objects. """ DrtParser.__init__(self) self.discourse_id = discourse_id self.sentence_id_offset = None self.quote_chars = [("'", "'", "\\", False)] def parse(self, data, signature=None): return DrtParser.parse(self, data, signature) def get_all_symbols(self): return ['(', ')', ',', '[', ']',':'] def handle(self, tok, context): return self.handle_drs(tok) def attempt_adjuncts(self, expression, context): return expression def parse_condition(self, indices): """ Parse a DRS condition :return: list of ``DrtExpression`` """ tok = self.token() accum = self.handle_condition(tok, indices) if accum is None: raise UnexpectedTokenException(tok) return accum def handle_drs(self, tok): if tok == 'drs': return self.parse_drs() elif tok in ['merge', 'smerge']: return self._handle_binary_expression(self._make_merge_expression)(None, []) elif tok in ['alfa']: return self._handle_alfa(self._make_merge_expression)(None, []) def handle_condition(self, tok, indices): """ Handle a DRS condition :param indices: list of int :return: list of ``DrtExpression`` """ if tok == 'not': return [self._handle_not()] if tok == 'or': conds = [self._handle_binary_expression(self._make_or_expression)] elif tok == 'imp': conds = [self._handle_binary_expression(self._make_imp_expression)] elif tok == 'eq': conds = [self._handle_eq()] elif tok == 'prop': conds = [self._handle_prop()] elif tok == 'pred': conds = [self._handle_pred()] elif tok == 'named': conds = [self._handle_named()] elif tok == 'rel': conds = [self._handle_rel()] elif tok == 'timex': conds = self._handle_timex() elif tok == 'card': conds = [self._handle_card()] elif tok == 'whq': conds = [self._handle_whq()] elif tok == 'duplex': conds = [self._handle_duplex()] else: conds = [] return sum([[cond(sent_index, word_indices) for cond in conds] for sent_index, word_indices in self._sent_and_word_indices(indices)], []) def _handle_not(self): self.assertToken(self.token(), '(') drs = self.process_next_expression(None) self.assertToken(self.token(), ')') return BoxerNot(drs) def _handle_pred(self): #pred(_G3943, dog, n, 0) self.assertToken(self.token(), '(') variable = self.parse_variable() self.assertToken(self.token(), ',') name = self.token() self.assertToken(self.token(), ',') pos = self.token() self.assertToken(self.token(), ',') sense = int(self.token()) self.assertToken(self.token(), ')') def _handle_pred_f(sent_index, word_indices): return BoxerPred(self.discourse_id, sent_index, word_indices, variable, name, pos, sense) return _handle_pred_f def _handle_duplex(self): #duplex(whq, drs(...), var, drs(...)) self.assertToken(self.token(), '(') # 
self.assertToken(self.token(), '[') ans_types = [] # while self.token(0) != ']': # cat = self.token() # self.assertToken(self.token(), ':') # if cat == 'des': # ans_types.append(self.token()) # elif cat == 'num': # ans_types.append('number') # typ = self.token() # if typ == 'cou': # ans_types.append('count') # else: # ans_types.append(typ) # else: # ans_types.append(self.token()) # self.token() #swallow the ']' self.assertToken(self.token(), 'whq') self.assertToken(self.token(), ',') d1 = self.process_next_expression(None) self.assertToken(self.token(), ',') ref = self.parse_variable() self.assertToken(self.token(), ',') d2 = self.process_next_expression(None) self.assertToken(self.token(), ')') return lambda sent_index, word_indices: BoxerWhq(self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2) def _handle_named(self): #named(x0, john, per, 0) self.assertToken(self.token(), '(') variable = self.parse_variable() self.assertToken(self.token(), ',') name = self.token() self.assertToken(self.token(), ',') type = self.token() self.assertToken(self.token(), ',') sense = self.token() # as per boxer rev 2554 self.assertToken(self.token(), ')') return lambda sent_index, word_indices: BoxerNamed(self.discourse_id, sent_index, word_indices, variable, name, type, sense) def _handle_rel(self): #rel(_G3993, _G3943, agent, 0) self.assertToken(self.token(), '(') var1 = self.parse_variable() self.assertToken(self.token(), ',') var2 = self.parse_variable() self.assertToken(self.token(), ',') rel = self.token() self.assertToken(self.token(), ',') sense = int(self.token()) self.assertToken(self.token(), ')') return lambda sent_index, word_indices: BoxerRel(self.discourse_id, sent_index, word_indices, var1, var2, rel, sense) def _handle_timex(self): #timex(_G18322, date([]: (+), []:'XXXX', [1004]:'04', []:'XX')) self.assertToken(self.token(), '(') arg = self.parse_variable() self.assertToken(self.token(), ',') new_conds = self._handle_time_expression(arg) self.assertToken(self.token(), ')') return new_conds def _handle_time_expression(self, arg): #date([]: (+), []:'XXXX', [1004]:'04', []:'XX') tok = self.token() self.assertToken(self.token(), '(') if tok == 'date': conds = self._handle_date(arg) elif tok == 'time': conds = self._handle_time(arg) else: return None self.assertToken(self.token(), ')') return [lambda sent_index, word_indices: BoxerPred(self.discourse_id, sent_index, word_indices, arg, tok, 'n', 0)] + \ [lambda sent_index, word_indices: cond for cond in conds] def _handle_date(self, arg): #[]: (+), []:'XXXX', [1004]:'04', []:'XX' conds = [] (sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list()) self.assertToken(self.token(), '(') pol = self.token() self.assertToken(self.token(), ')') conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_pol_%s' % (pol), 'a', 0)) self.assertToken(self.token(), ',') (sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list()) year = self.token() if year != 'XXXX': year = year.replace(':', '_') conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_year_%s' % (year), 'a', 0)) self.assertToken(self.token(), ',') (sent_index, word_indices), = self._sent_and_word_indices(self._parse_index_list()) month = self.token() if month != 'XX': conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_month_%s' % (month), 'a', 0)) self.assertToken(self.token(), ',') (sent_index, word_indices), = 
self._sent_and_word_indices(self._parse_index_list()) day = self.token() if day != 'XX': conds.append(BoxerPred(self.discourse_id, sent_index, word_indices, arg, 'date_day_%s' % (day), 'a', 0)) return conds def _handle_time(self, arg): #time([1018]:'18', []:'XX', []:'XX') conds = [] self._parse_index_list() hour = self.token() if hour != 'XX': conds.append(self._make_atom('r_hour_2',arg,hour)) self.assertToken(self.token(), ',') self._parse_index_list() min = self.token() if min != 'XX': conds.append(self._make_atom('r_min_2',arg,min)) self.assertToken(self.token(), ',') self._parse_index_list() sec = self.token() if sec != 'XX': conds.append(self._make_atom('r_sec_2',arg,sec)) return conds def _handle_card(self): #card(_G18535, 28, ge) self.assertToken(self.token(), '(') variable = self.parse_variable() self.assertToken(self.token(), ',') value = self.token() self.assertToken(self.token(), ',') type = self.token() self.assertToken(self.token(), ')') return lambda sent_index, word_indices: BoxerCard(self.discourse_id, sent_index, word_indices, variable, value, type) def _handle_prop(self): #prop(_G15949, drs(...)) self.assertToken(self.token(), '(') variable = self.parse_variable() self.assertToken(self.token(), ',') drs = self.process_next_expression(None) self.assertToken(self.token(), ')') return lambda sent_index, word_indices: BoxerProp(self.discourse_id, sent_index, word_indices, variable, drs) def _parse_index_list(self): #[1001,1002]: indices = [] self.assertToken(self.token(), '[') while self.token(0) != ']': indices.append(self.parse_index()) if self.token(0) == ',': self.token() #swallow ',' self.token() #swallow ']' self.assertToken(self.token(), ':') return indices def parse_drs(self): #drs([[1001]:_G3943], # [[1002]:pred(_G3943, dog, n, 0)] # ) self.assertToken(self.token(), '(') self.assertToken(self.token(), '[') refs = set() while self.token(0) != ']': indices = self._parse_index_list() refs.add(self.parse_variable()) if self.token(0) == ',': self.token() #swallow ',' self.token() #swallow ']' self.assertToken(self.token(), ',') self.assertToken(self.token(), '[') conds = [] while self.token(0) != ']': indices = self._parse_index_list() conds.extend(self.parse_condition(indices)) if self.token(0) == ',': self.token() #swallow ',' self.token() #swallow ']' self.assertToken(self.token(), ')') return BoxerDrs(list(refs), conds) def _handle_binary_expression(self, make_callback): self.assertToken(self.token(), '(') drs1 = self.process_next_expression(None) self.assertToken(self.token(), ',') drs2 = self.process_next_expression(None) self.assertToken(self.token(), ')') return lambda sent_index, word_indices: make_callback(sent_index, word_indices, drs1, drs2) def _handle_alfa(self, make_callback): self.assertToken(self.token(), '(') type = self.token() self.assertToken(self.token(), ',') drs1 = self.process_next_expression(None) self.assertToken(self.token(), ',') drs2 = self.process_next_expression(None) self.assertToken(self.token(), ')') return lambda sent_index, word_indices: make_callback(sent_index, word_indices, drs1, drs2) def _handle_eq(self): self.assertToken(self.token(), '(') var1 = self.parse_variable() self.assertToken(self.token(), ',') var2 = self.parse_variable() self.assertToken(self.token(), ')') return lambda sent_index, word_indices: BoxerEq(self.discourse_id, sent_index, word_indices, var1, var2) def _handle_whq(self): self.assertToken(self.token(), '(') self.assertToken(self.token(), '[') ans_types = [] while self.token(0) != ']': cat = self.token() 
self.assertToken(self.token(), ':') if cat == 'des': ans_types.append(self.token()) elif cat == 'num': ans_types.append('number') typ = self.token() if typ == 'cou': ans_types.append('count') else: ans_types.append(typ) else: ans_types.append(self.token()) self.token() #swallow the ']' self.assertToken(self.token(), ',') d1 = self.process_next_expression(None) self.assertToken(self.token(), ',') ref = self.parse_variable() self.assertToken(self.token(), ',') d2 = self.process_next_expression(None) self.assertToken(self.token(), ')') return lambda sent_index, word_indices: BoxerWhq(self.discourse_id, sent_index, word_indices, ans_types, d1, ref, d2) def _make_merge_expression(self, sent_index, word_indices, drs1, drs2): return BoxerDrs(drs1.refs + drs2.refs, drs1.conds + drs2.conds) def _make_or_expression(self, sent_index, word_indices, drs1, drs2): return BoxerOr(self.discourse_id, sent_index, word_indices, drs1, drs2) def _make_imp_expression(self, sent_index, word_indices, drs1, drs2): return BoxerDrs(drs1.refs, drs1.conds, drs2) def parse_variable(self): var = self.token() assert re.match('^[exps]\d+$', var), var return var def parse_index(self): return int(self.token()) def _sent_and_word_indices(self, indices): """ :return: list of (sent_index, word_indices) tuples """ sent_indices = set((i / 1000)-1 for i in indices if i>=0) if sent_indices: pairs = [] for sent_index in sent_indices: word_indices = [(i % 1000)-1 for i in indices if sent_index == (i / 1000)-1] pairs.append((sent_index, word_indices)) return pairs else: word_indices = [(i % 1000)-1 for i in indices] return [(None, word_indices)] class BoxerDrsParser(DrtParser): """ Reparse the str form of subclasses of ``AbstractBoxerDrs`` """ def __init__(self, discourse_id=None): DrtParser.__init__(self) self.discourse_id = discourse_id def get_all_symbols(self): return [DrtTokens.OPEN, DrtTokens.CLOSE, DrtTokens.COMMA, DrtTokens.OPEN_BRACKET, DrtTokens.CLOSE_BRACKET] def attempt_adjuncts(self, expression, context): return expression def handle(self, tok, context): try: # if tok == 'drs': # self.assertNextToken(DrtTokens.OPEN) # label = int(self.token()) # self.assertNextToken(DrtTokens.COMMA) # refs = list(map(int, self.handle_refs())) # self.assertNextToken(DrtTokens.COMMA) # conds = self.handle_conds(None) # self.assertNextToken(DrtTokens.CLOSE) # return BoxerDrs(label, refs, conds) if tok == 'pred': self.assertNextToken(DrtTokens.OPEN) disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None] self.assertNextToken(DrtTokens.COMMA) sent_id = self.nullableIntToken() self.assertNextToken(DrtTokens.COMMA) word_ids = list(map(int, self.handle_refs())) self.assertNextToken(DrtTokens.COMMA) variable = int(self.token()) self.assertNextToken(DrtTokens.COMMA) name = self.token() self.assertNextToken(DrtTokens.COMMA) pos = self.token() self.assertNextToken(DrtTokens.COMMA) sense = int(self.token()) self.assertNextToken(DrtTokens.CLOSE) return BoxerPred(disc_id, sent_id, word_ids, variable, name, pos, sense) elif tok == 'named': self.assertNextToken(DrtTokens.OPEN) disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None] self.assertNextToken(DrtTokens.COMMA) sent_id = int(self.token()) self.assertNextToken(DrtTokens.COMMA) word_ids = map(int, self.handle_refs()) self.assertNextToken(DrtTokens.COMMA) variable = int(self.token()) self.assertNextToken(DrtTokens.COMMA) name = self.token() self.assertNextToken(DrtTokens.COMMA) type = self.token() self.assertNextToken(DrtTokens.COMMA) sense = int(self.token()) 
self.assertNextToken(DrtTokens.CLOSE) return BoxerNamed(disc_id, sent_id, word_ids, variable, name, type, sense) elif tok == 'rel': self.assertNextToken(DrtTokens.OPEN) disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None] self.assertNextToken(DrtTokens.COMMA) sent_id = self.nullableIntToken() self.assertNextToken(DrtTokens.COMMA) word_ids = list(map(int, self.handle_refs())) self.assertNextToken(DrtTokens.COMMA) var1 = int(self.token()) self.assertNextToken(DrtTokens.COMMA) var2 = int(self.token()) self.assertNextToken(DrtTokens.COMMA) rel = self.token() self.assertNextToken(DrtTokens.COMMA) sense = int(self.token()) self.assertNextToken(DrtTokens.CLOSE) return BoxerRel(disc_id, sent_id, word_ids, var1, var2, rel, sense) elif tok == 'prop': self.assertNextToken(DrtTokens.OPEN) disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None] self.assertNextToken(DrtTokens.COMMA) sent_id = int(self.token()) self.assertNextToken(DrtTokens.COMMA) word_ids = list(map(int, self.handle_refs())) self.assertNextToken(DrtTokens.COMMA) variable = int(self.token()) self.assertNextToken(DrtTokens.COMMA) drs = self.process_next_expression(None) self.assertNextToken(DrtTokens.CLOSE) return BoxerProp(disc_id, sent_id, word_ids, variable, drs) elif tok == 'not': self.assertNextToken(DrtTokens.OPEN) drs = self.process_next_expression(None) self.assertNextToken(DrtTokens.CLOSE) return BoxerNot(drs) elif tok == 'imp': self.assertNextToken(DrtTokens.OPEN) drs1 = self.process_next_expression(None) self.assertNextToken(DrtTokens.COMMA) drs2 = self.process_next_expression(None) self.assertNextToken(DrtTokens.CLOSE) return BoxerDrs(drs1.refs, drs1.conds, drs2) elif tok == 'or': self.assertNextToken(DrtTokens.OPEN) disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None] self.assertNextToken(DrtTokens.COMMA) sent_id = self.nullableIntToken() self.assertNextToken(DrtTokens.COMMA) word_ids = map(int, self.handle_refs()) self.assertNextToken(DrtTokens.COMMA) drs1 = self.process_next_expression(None) self.assertNextToken(DrtTokens.COMMA) drs2 = self.process_next_expression(None) self.assertNextToken(DrtTokens.CLOSE) return BoxerOr(disc_id, sent_id, word_ids, drs1, drs2) elif tok == 'eq': self.assertNextToken(DrtTokens.OPEN) disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None] self.assertNextToken(DrtTokens.COMMA) sent_id = self.nullableIntToken() self.assertNextToken(DrtTokens.COMMA) word_ids = list(map(int, self.handle_refs())) self.assertNextToken(DrtTokens.COMMA) var1 = int(self.token()) self.assertNextToken(DrtTokens.COMMA) var2 = int(self.token()) self.assertNextToken(DrtTokens.CLOSE) return BoxerEq(disc_id, sent_id, word_ids, var1, var2) elif tok == 'card': self.assertNextToken(DrtTokens.OPEN) disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None] self.assertNextToken(DrtTokens.COMMA) sent_id = self.nullableIntToken() self.assertNextToken(DrtTokens.COMMA) word_ids = map(int, self.handle_refs()) self.assertNextToken(DrtTokens.COMMA) var = int(self.token()) self.assertNextToken(DrtTokens.COMMA) value = self.token() self.assertNextToken(DrtTokens.COMMA) type = self.token() self.assertNextToken(DrtTokens.CLOSE) return BoxerCard(disc_id, sent_id, word_ids, var, value, type) elif tok == 'whq': self.assertNextToken(DrtTokens.OPEN) disc_id = (self.token(), self.discourse_id)[self.discourse_id is not None] self.assertNextToken(DrtTokens.COMMA) sent_id = self.nullableIntToken() self.assertNextToken(DrtTokens.COMMA) word_ids = 
list(map(int, self.handle_refs())) self.assertNextToken(DrtTokens.COMMA) ans_types = self.handle_refs() self.assertNextToken(DrtTokens.COMMA) drs1 = self.process_next_expression(None) self.assertNextToken(DrtTokens.COMMA) var = int(self.token()) self.assertNextToken(DrtTokens.COMMA) drs2 = self.process_next_expression(None) self.assertNextToken(DrtTokens.CLOSE) return BoxerWhq(disc_id, sent_id, word_ids, ans_types, drs1, var, drs2) except Exception as e: raise LogicalExpressionException(self._currentIndex, str(e)) assert False, repr(tok) def nullableIntToken(self): t = self.token() return [None,int(t)][t != 'None'] def get_next_token_variable(self, description): try: return self.token() except ExpectedMoreTokensException as e: raise ExpectedMoreTokensException(e.index, 'Variable expected.') class AbstractBoxerDrs(object): def variables(self): """ :return: (set, set, set) """ variables, events, propositions = self._variables() return (variables - (events | propositions), events, propositions - events) def variable_types(self): vartypes = {} for t,vars in zip(('z','e','p'), self.variables()): for v in vars: vartypes[v] = t return vartypes def _variables(self): """ :return: (set, set, set) """ return (set(), set(), set()) def atoms(self): return set() def clean(self): return self def _clean_name(self, name): return name.replace('-','_').replace("'", "_") def renumber_sentences(self, f): return self def __hash__(self): return hash("%s" % self) @python_2_unicode_compatible class BoxerDrs(AbstractBoxerDrs): def __init__(self, refs, conds, consequent=None): AbstractBoxerDrs.__init__(self) self.refs = refs self.conds = conds self.consequent = consequent def _variables(self): variables = (set(), set(), set()) for cond in self.conds: for s,v in zip(variables, cond._variables()): s.update(v) if self.consequent is not None: for s,v in zip(variables, self.consequent._variables()): s.update(v) return variables def atoms(self): atoms = reduce(operator.or_, (cond.atoms() for cond in self.conds), set()) if self.consequent is not None: atoms.update(self.consequent.atoms()) return atoms def clean(self): consequent = (self.consequent.clean() if self.consequent else None) return BoxerDrs(self.refs, [c.clean() for c in self.conds], consequent) def renumber_sentences(self, f): consequent = (self.consequent.renumber_sentences(f) if self.consequent else None) return BoxerDrs(self.refs, [c.renumber_sentences(f) for c in self.conds], consequent) def __repr__(self): s = 'drs([%s], [%s])' % (', '.join("%s" % r for r in self.refs), ', '.join("%s" % c for c in self.conds)) if self.consequent is not None: s = 'imp(%s, %s)' % (s, self.consequent) return s def __eq__(self, other): return self.__class__ == other.__class__ and \ self.refs == other.refs and \ len(self.conds) == len(other.conds) and \ reduce(operator.and_, (c1==c2 for c1,c2 in zip(self.conds, other.conds))) and \ self.consequent == other.consequent def __ne__(self, other): return not self == other __hash__ = AbstractBoxerDrs.__hash__ @python_2_unicode_compatible class BoxerNot(AbstractBoxerDrs): def __init__(self, drs): AbstractBoxerDrs.__init__(self) self.drs = drs def _variables(self): return self.drs._variables() def atoms(self): return self.drs.atoms() def clean(self): return BoxerNot(self.drs.clean()) def renumber_sentences(self, f): return BoxerNot(self.drs.renumber_sentences(f)) def __repr__(self): return 'not(%s)' % (self.drs) def __eq__(self, other): return self.__class__ == other.__class__ and self.drs == other.drs def __ne__(self, other): return not 
self == other __hash__ = AbstractBoxerDrs.__hash__ @python_2_unicode_compatible class BoxerIndexed(AbstractBoxerDrs): def __init__(self, discourse_id, sent_index, word_indices): AbstractBoxerDrs.__init__(self) self.discourse_id = discourse_id self.sent_index = sent_index self.word_indices = word_indices def atoms(self): return set([self]) def __eq__(self, other): return self.__class__ == other.__class__ and \ self.discourse_id == other.discourse_id and \ self.sent_index == other.sent_index and \ self.word_indices == other.word_indices and \ reduce(operator.and_, (s==o for s,o in zip(self, other))) def __ne__(self, other): return not self == other __hash__ = AbstractBoxerDrs.__hash__ def __repr__(self): s = '%s(%s, %s, [%s]' % (self._pred(), self.discourse_id, self.sent_index, ', '.join("%s" % wi for wi in self.word_indices)) for v in self: s += ', %s' % v return s + ')' class BoxerPred(BoxerIndexed): def __init__(self, discourse_id, sent_index, word_indices, var, name, pos, sense): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.var = var self.name = name self.pos = pos self.sense = sense def _variables(self): return (set([self.var]), set(), set()) def change_var(self, var): return BoxerPred(self.discourse_id, self.sent_index, self.word_indices, var, self.name, self.pos, self.sense) def clean(self): return BoxerPred(self.discourse_id, self.sent_index, self.word_indices, self.var, self._clean_name(self.name), self.pos, self.sense) def renumber_sentences(self, f): new_sent_index = f(self.sent_index) return BoxerPred(self.discourse_id, new_sent_index, self.word_indices, self.var, self.name, self.pos, self.sense) def __iter__(self): return iter((self.var, self.name, self.pos, self.sense)) def _pred(self): return 'pred' class BoxerNamed(BoxerIndexed): def __init__(self, discourse_id, sent_index, word_indices, var, name, type, sense): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.var = var self.name = name self.type = type self.sense = sense def _variables(self): return (set([self.var]), set(), set()) def change_var(self, var): return BoxerNamed(self.discourse_id, self.sent_index, self.word_indices, var, self.name, self.type, self.sense) def clean(self): return BoxerNamed(self.discourse_id, self.sent_index, self.word_indices, self.var, self._clean_name(self.name), self.type, self.sense) def renumber_sentences(self, f): return BoxerNamed(self.discourse_id, f(self.sent_index), self.word_indices, self.var, self.name, self.type, self.sense) def __iter__(self): return iter((self.var, self.name, self.type, self.sense)) def _pred(self): return 'named' class BoxerRel(BoxerIndexed): def __init__(self, discourse_id, sent_index, word_indices, var1, var2, rel, sense): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.var1 = var1 self.var2 = var2 self.rel = rel self.sense = sense def _variables(self): return (set([self.var1, self.var2]), set(), set()) def clean(self): return BoxerRel(self.discourse_id, self.sent_index, self.word_indices, self.var1, self.var2, self._clean_name(self.rel), self.sense) def renumber_sentences(self, f): return BoxerRel(self.discourse_id, f(self.sent_index), self.word_indices, self.var1, self.var2, self.rel, self.sense) def __iter__(self): return iter((self.var1, self.var2, self.rel, self.sense)) def _pred(self): return 'rel' class BoxerProp(BoxerIndexed): def __init__(self, discourse_id, sent_index, word_indices, var, drs): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.var = var 
self.drs = drs def _variables(self): return tuple(map(operator.or_, (set(), set(), set([self.var])), self.drs._variables())) def referenced_labels(self): return set([self.drs]) def atoms(self): return self.drs.atoms() def clean(self): return BoxerProp(self.discourse_id, self.sent_index, self.word_indices, self.var, self.drs.clean()) def renumber_sentences(self, f): return BoxerProp(self.discourse_id, f(self.sent_index), self.word_indices, self.var, self.drs.renumber_sentences(f)) def __iter__(self): return iter((self.var, self.drs)) def _pred(self): return 'prop' class BoxerEq(BoxerIndexed): def __init__(self, discourse_id, sent_index, word_indices, var1, var2): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.var1 = var1 self.var2 = var2 def _variables(self): return (set([self.var1, self.var2]), set(), set()) def atoms(self): return set() def renumber_sentences(self, f): return BoxerEq(self.discourse_id, f(self.sent_index), self.word_indices, self.var1, self.var2) def __iter__(self): return iter((self.var1, self.var2)) def _pred(self): return 'eq' class BoxerCard(BoxerIndexed): def __init__(self, discourse_id, sent_index, word_indices, var, value, type): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.var = var self.value = value self.type = type def _variables(self): return (set([self.var]), set(), set()) def renumber_sentences(self, f): return BoxerCard(self.discourse_id, f(self.sent_index), self.word_indices, self.var, self.value, self.type) def __iter__(self): return iter((self.var, self.value, self.type)) def _pred(self): return 'card' class BoxerOr(BoxerIndexed): def __init__(self, discourse_id, sent_index, word_indices, drs1, drs2): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.drs1 = drs1 self.drs2 = drs2 def _variables(self): return tuple(map(operator.or_, self.drs1._variables(), self.drs2._variables())) def atoms(self): return self.drs1.atoms() | self.drs2.atoms() def clean(self): return BoxerOr(self.discourse_id, self.sent_index, self.word_indices, self.drs1.clean(), self.drs2.clean()) def renumber_sentences(self, f): return BoxerOr(self.discourse_id, f(self.sent_index), self.word_indices, self.drs1, self.drs2) def __iter__(self): return iter((self.drs1, self.drs2)) def _pred(self): return 'or' class BoxerWhq(BoxerIndexed): def __init__(self, discourse_id, sent_index, word_indices, ans_types, drs1, variable, drs2): BoxerIndexed.__init__(self, discourse_id, sent_index, word_indices) self.ans_types = ans_types self.drs1 = drs1 self.variable = variable self.drs2 = drs2 def _variables(self): return tuple(map(operator.or_, (set([self.variable]), set(), set()), self.drs1._variables(), self.drs2._variables())) def atoms(self): return self.drs1.atoms() | self.drs2.atoms() def clean(self): return BoxerWhq(self.discourse_id, self.sent_index, self.word_indices, self.ans_types, self.drs1.clean(), self.variable, self.drs2.clean()) def renumber_sentences(self, f): return BoxerWhq(self.discourse_id, f(self.sent_index), self.word_indices, self.ans_types, self.drs1, self.variable, self.drs2) def __iter__(self): return iter(('['+','.join(self.ans_types)+']', self.drs1, self.variable, self.drs2)) def _pred(self): return 'whq' class PassthroughBoxerDrsInterpreter(object): def interpret(self, ex): return ex class NltkDrtBoxerDrsInterpreter(object): def __init__(self, occur_index=False): self._occur_index = occur_index def interpret(self, ex): """ :param ex: ``AbstractBoxerDrs`` :return: ``DrtExpression`` """ if 
isinstance(ex, BoxerDrs): drs = DRS([Variable(r) for r in ex.refs], list(map(self.interpret, ex.conds))) if ex.consequent is not None: drs.consequent = self.interpret(ex.consequent) return drs elif isinstance(ex, BoxerNot): return DrtNegatedExpression(self.interpret(ex.drs)) elif isinstance(ex, BoxerPred): pred = self._add_occur_indexing('%s_%s' % (ex.pos, ex.name), ex) return self._make_atom(pred, ex.var) elif isinstance(ex, BoxerNamed): pred = self._add_occur_indexing('ne_%s_%s' % (ex.type, ex.name), ex) return self._make_atom(pred, ex.var) elif isinstance(ex, BoxerRel): pred = self._add_occur_indexing('%s' % (ex.rel), ex) return self._make_atom(pred, ex.var1, ex.var2) elif isinstance(ex, BoxerProp): return DrtProposition(Variable(ex.var), self.interpret(ex.drs)) elif isinstance(ex, BoxerEq): return DrtEqualityExpression(DrtVariableExpression(Variable(ex.var1)), DrtVariableExpression(Variable(ex.var2))) elif isinstance(ex, BoxerCard): pred = self._add_occur_indexing('card_%s_%s' % (ex.type, ex.value), ex) return self._make_atom(pred, ex.var) elif isinstance(ex, BoxerOr): return DrtOrExpression(self.interpret(ex.drs1), self.interpret(ex.drs2)) elif isinstance(ex, BoxerWhq): drs1 = self.interpret(ex.drs1) drs2 = self.interpret(ex.drs2) return DRS(drs1.refs + drs2.refs, drs1.conds + drs2.conds) assert False, '%s: %s' % (ex.__class__.__name__, ex) def _make_atom(self, pred, *args): accum = DrtVariableExpression(Variable(pred)) for arg in args: accum = DrtApplicationExpression(accum, DrtVariableExpression(Variable(arg))) return accum def _add_occur_indexing(self, base, ex): if self._occur_index and ex.sent_index is not None: if ex.discourse_id: base += '_%s' % ex.discourse_id base += '_s%s' % ex.sent_index base += '_w%s' % sorted(ex.word_indices)[0] return base class UnparseableInputException(Exception): pass if __name__ == '__main__': opts = OptionParser("usage: %prog TEXT [options]") opts.add_option("--verbose", "-v", help="display verbose logs", action="store_true", default=False, dest="verbose") opts.add_option("--fol", "-f", help="output FOL", action="store_true", default=False, dest="fol") opts.add_option("--question", "-q", help="input is a question", action="store_true", default=False, dest="question") opts.add_option("--occur", "-o", help="occurrence index", action="store_true", default=False, dest="occur_index") (options, args) = opts.parse_args() if len(args) != 1: opts.error("incorrect number of arguments") interpreter = NltkDrtBoxerDrsInterpreter(occur_index=options.occur_index) drs = Boxer(interpreter).interpret_multi(args[0].split(r'\n'), question=options.question, verbose=options.verbose) if drs is None: print(None) else: drs = drs.simplify().eliminate_equality() if options.fol: print(drs.fol().normalize()) else: drs.pretty_print() nltk-3.1/nltk/sem/chat80.py0000644000076500000240000006224112607224144015260 0ustar sbstaff00000000000000# Natural Language Toolkit: Chat-80 KB Reader # See http://www.w3.org/TR/swbp-skos-core-guide/ # # Copyright (C) 2001-2015 NLTK Project # Author: Ewan Klein , # URL: # For license information, see LICENSE.TXT """ Overview ======== Chat-80 was a natural language system which allowed the user to interrogate a Prolog knowledge base in the domain of world geography. It was developed in the early '80s by Warren and Pereira; see ``http://www.aclweb.org/anthology/J82-3002.pdf`` for a description and ``http://www.cis.upenn.edu/~pereira/oldies.html`` for the source files. 
This module contains functions to extract data from the Chat-80 relation files ('the world database'), and convert then into a format that can be incorporated in the FOL models of ``nltk.sem.evaluate``. The code assumes that the Prolog input files are available in the NLTK corpora directory. The Chat-80 World Database consists of the following files:: world0.pl rivers.pl cities.pl countries.pl contain.pl borders.pl This module uses a slightly modified version of ``world0.pl``, in which a set of Prolog rules have been omitted. The modified file is named ``world1.pl``. Currently, the file ``rivers.pl`` is not read in, since it uses a list rather than a string in the second field. Reading Chat-80 Files ===================== Chat-80 relations are like tables in a relational database. The relation acts as the name of the table; the first argument acts as the 'primary key'; and subsequent arguments are further fields in the table. In general, the name of the table provides a label for a unary predicate whose extension is all the primary keys. For example, relations in ``cities.pl`` are of the following form:: 'city(athens,greece,1368).' Here, ``'athens'`` is the key, and will be mapped to a member of the unary predicate *city*. The fields in the table are mapped to binary predicates. The first argument of the predicate is the primary key, while the second argument is the data in the relevant field. Thus, in the above example, the third field is mapped to the binary predicate *population_of*, whose extension is a set of pairs such as ``'(athens, 1368)'``. An exception to this general framework is required by the relations in the files ``borders.pl`` and ``contains.pl``. These contain facts of the following form:: 'borders(albania,greece).' 'contains0(africa,central_africa).' We do not want to form a unary concept out the element in the first field of these records, and we want the label of the binary relation just to be ``'border'``/``'contain'`` respectively. In order to drive the extraction process, we use 'relation metadata bundles' which are Python dictionaries such as the following:: city = {'label': 'city', 'closures': [], 'schema': ['city', 'country', 'population'], 'filename': 'cities.pl'} According to this, the file ``city['filename']`` contains a list of relational tuples (or more accurately, the corresponding strings in Prolog form) whose predicate symbol is ``city['label']`` and whose relational schema is ``city['schema']``. The notion of a ``closure`` is discussed in the next section. Concepts ======== In order to encapsulate the results of the extraction, a class of ``Concept`` objects is introduced. A ``Concept`` object has a number of attributes, in particular a ``prefLabel`` and ``extension``, which make it easier to inspect the output of the extraction. In addition, the ``extension`` can be further processed: in the case of the ``'border'`` relation, we check that the relation is symmetric, and in the case of the ``'contain'`` relation, we carry out the transitive closure. The closure properties associated with a concept is indicated in the relation metadata, as indicated earlier. The ``extension`` of a ``Concept`` object is then incorporated into a ``Valuation`` object. Persistence =========== The functions ``val_dump`` and ``val_load`` are provided to allow a valuation to be stored in a persistent database and re-loaded, rather than having to be re-computed each time. 
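A typical round trip looks like the following, where the database name
is only illustrative (``val_dump`` adds the '.db' suffix itself)::

    from nltk.sem import chat80
    chat80.val_dump(chat80.rels, 'chat80_kb')
    val = chat80.val_load('chat80_kb')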
Individuals and Lexical Items ============================= As well as deriving relations from the Chat-80 data, we also create a set of individual constants, one for each entity in the domain. The individual constants are string-identical to the entities. For example, given a data item such as ``'zloty'``, we add to the valuation a pair ``('zloty', 'zloty')``. In order to parse English sentences that refer to these entities, we also create a lexical item such as the following for each individual constant:: PropN[num=sg, sem=<\P.(P zloty)>] -> 'Zloty' The set of rules is written to the file ``chat_pnames.cfg`` in the current directory. """ from __future__ import print_function, unicode_literals import re import shelve import os import sys import nltk.data from nltk.compat import string_types, python_2_unicode_compatible ########################################################################### # Chat-80 relation metadata bundles needed to build the valuation ########################################################################### borders = {'rel_name': 'borders', 'closures': ['symmetric'], 'schema': ['region', 'border'], 'filename': 'borders.pl'} contains = {'rel_name': 'contains0', 'closures': ['transitive'], 'schema': ['region', 'contain'], 'filename': 'contain.pl'} city = {'rel_name': 'city', 'closures': [], 'schema': ['city', 'country', 'population'], 'filename': 'cities.pl'} country = {'rel_name': 'country', 'closures': [], 'schema': ['country', 'region', 'latitude', 'longitude', 'area', 'population', 'capital', 'currency'], 'filename': 'countries.pl'} circle_of_lat = {'rel_name': 'circle_of_latitude', 'closures': [], 'schema': ['circle_of_latitude', 'degrees'], 'filename': 'world1.pl'} circle_of_long = {'rel_name': 'circle_of_longitude', 'closures': [], 'schema': ['circle_of_longitude', 'degrees'], 'filename': 'world1.pl'} continent = {'rel_name': 'continent', 'closures': [], 'schema': ['continent'], 'filename': 'world1.pl'} region = {'rel_name': 'in_continent', 'closures': [], 'schema': ['region', 'continent'], 'filename': 'world1.pl'} ocean = {'rel_name': 'ocean', 'closures': [], 'schema': ['ocean'], 'filename': 'world1.pl'} sea = {'rel_name': 'sea', 'closures': [], 'schema': ['sea'], 'filename': 'world1.pl'} items = ['borders', 'contains', 'city', 'country', 'circle_of_lat', 'circle_of_long', 'continent', 'region', 'ocean', 'sea'] items = tuple(sorted(items)) item_metadata = { 'borders': borders, 'contains': contains, 'city': city, 'country': country, 'circle_of_lat': circle_of_lat, 'circle_of_long': circle_of_long, 'continent': continent, 'region': region, 'ocean': ocean, 'sea': sea } rels = item_metadata.values() not_unary = ['borders.pl', 'contain.pl'] ########################################################################### @python_2_unicode_compatible class Concept(object): """ A Concept class, loosely based on SKOS (http://www.w3.org/TR/swbp-skos-core-guide/). 
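A small illustration (the entities are just examples):

>>> from nltk.sem.chat80 import Concept
>>> c = Concept('city', arity=1, extension=set(['athens', 'paris']))
>>> c.arity
1
>>> c.extension
['athens', 'paris']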
""" def __init__(self, prefLabel, arity, altLabels=[], closures=[], extension=set()): """ :param prefLabel: the preferred label for the concept :type prefLabel: str :param arity: the arity of the concept :type arity: int @keyword altLabels: other (related) labels :type altLabels: list @keyword closures: closure properties of the extension \ (list items can be ``symmetric``, ``reflexive``, ``transitive``) :type closures: list @keyword extension: the extensional value of the concept :type extension: set """ self.prefLabel = prefLabel self.arity = arity self.altLabels = altLabels self.closures = closures #keep _extension internally as a set self._extension = extension #public access is via a list (for slicing) self.extension = sorted(list(extension)) def __str__(self): #_extension = '' #for element in sorted(self.extension): #if isinstance(element, tuple): #element = '(%s, %s)' % (element) #_extension += element + ', ' #_extension = _extension[:-1] return "Label = '%s'\nArity = %s\nExtension = %s" % \ (self.prefLabel, self.arity, self.extension) def __repr__(self): return "Concept('%s')" % self.prefLabel def augment(self, data): """ Add more data to the ``Concept``'s extension set. :param data: a new semantic value :type data: string or pair of strings :rtype: set """ self._extension.add(data) self.extension = sorted(list(self._extension)) return self._extension def _make_graph(self, s): """ Convert a set of pairs into an adjacency linked list encoding of a graph. """ g = {} for (x, y) in s: if x in g: g[x].append(y) else: g[x] = [y] return g def _transclose(self, g): """ Compute the transitive closure of a graph represented as a linked list. """ for x in g: for adjacent in g[x]: # check that adjacent is a key if adjacent in g: for y in g[adjacent]: if y not in g[x]: g[x].append(y) return g def _make_pairs(self, g): """ Convert an adjacency linked list back into a set of pairs. """ pairs = [] for node in g: for adjacent in g[node]: pairs.append((node, adjacent)) return set(pairs) def close(self): """ Close a binary relation in the ``Concept``'s extension set. :return: a new extension for the ``Concept`` in which the relation is closed under a given property """ from nltk.sem import is_rel assert is_rel(self._extension) if 'symmetric' in self.closures: pairs = [] for (x, y) in self._extension: pairs.append((y, x)) sym = set(pairs) self._extension = self._extension.union(sym) if 'transitive' in self.closures: all = self._make_graph(self._extension) closed = self._transclose(all) trans = self._make_pairs(closed) #print sorted(trans) self._extension = self._extension.union(trans) self.extension = sorted(list(self._extension)) def clause2concepts(filename, rel_name, schema, closures=[]): """ Convert a file of Prolog clauses into a list of ``Concept`` objects. 
:param filename: filename containing the relations :type filename: str :param rel_name: name of the relation :type rel_name: str :param schema: the schema used in a set of relational tuples :type schema: list :param closures: closure properties for the extension of the concept :type closures: list :return: a list of ``Concept`` objects :rtype: list """ concepts = [] # position of the subject of a binary relation subj = 0 # label of the 'primary key' pkey = schema[0] # fields other than the primary key fields = schema[1:] # convert a file into a list of lists records = _str2records(filename, rel_name) # add a unary concept corresponding to the set of entities # in the primary key position # relations in 'not_unary' are more like ordinary binary relations if not filename in not_unary: concepts.append(unary_concept(pkey, subj, records)) # add a binary concept for each non-key field for field in fields: obj = schema.index(field) concepts.append(binary_concept(field, closures, subj, obj, records)) return concepts def cities2table(filename, rel_name, dbname, verbose=False, setup=False): """ Convert a file of Prolog clauses into a database table. This is not generic, since it doesn't allow arbitrary schemas to be set as a parameter. Intended usage:: cities2table('cities.pl', 'city', 'city.db', verbose=True, setup=True) :param filename: filename containing the relations :type filename: str :param rel_name: name of the relation :type rel_name: str :param dbname: filename of persistent store :type schema: str """ import sqlite3 records = _str2records(filename, rel_name) connection = sqlite3.connect(dbname) cur = connection.cursor() if setup: cur.execute('''CREATE TABLE city_table (City text, Country text, Population int)''') table_name = "city_table" for t in records: cur.execute('insert into %s values (?,?,?)' % table_name, t) if verbose: print("inserting values into %s: " % table_name, t) connection.commit() if verbose: print("Committing update to %s" % dbname) cur.close() def sql_query(dbname, query): """ Execute an SQL query over a database. :param dbname: filename of persistent store :type schema: str :param query: SQL query :type rel_name: str """ import sqlite3 try: path = nltk.data.find(dbname) connection = sqlite3.connect(str(path)) cur = connection.cursor() return cur.execute(query) except (ValueError, sqlite3.OperationalError): import warnings warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname) raise def _str2records(filename, rel): """ Read a file into memory and convert each relation clause into a list. """ recs = [] contents = nltk.data.load("corpora/chat80/%s" % filename, format="text") for line in contents.splitlines(): if line.startswith(rel): line = re.sub(rel+r'\(', '', line) line = re.sub(r'\)\.$', '', line) record = line.split(',') recs.append(record) return recs def unary_concept(label, subj, records): """ Make a unary concept out of the primary key in a record. A record is a list of entities in some relation, such as ``['france', 'paris']``, where ``'france'`` is acting as the primary key. 
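For example (the records shown are illustrative)::

    unary_concept('country', 0, [['france', 'paris'], ['greece', 'athens']])

yields a ``Concept`` labelled ``'country'`` whose extension is
``['france', 'greece']``.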
:param label: the preferred label for the concept :type label: string :param subj: position in the record of the subject of the predicate :type subj: int :param records: a list of records :type records: list of lists :return: ``Concept`` of arity 1 :rtype: Concept """ c = Concept(label, arity=1, extension=set()) for record in records: c.augment(record[subj]) return c def binary_concept(label, closures, subj, obj, records): """ Make a binary concept out of the primary key and another field in a record. A record is a list of entities in some relation, such as ``['france', 'paris']``, where ``'france'`` is acting as the primary key, and ``'paris'`` stands in the ``'capital_of'`` relation to ``'france'``. More generally, given a record such as ``['a', 'b', 'c']``, where label is bound to ``'B'``, and ``obj`` bound to 1, the derived binary concept will have label ``'B_of'``, and its extension will be a set of pairs such as ``('a', 'b')``. :param label: the base part of the preferred label for the concept :type label: str :param closures: closure properties for the extension of the concept :type closures: list :param subj: position in the record of the subject of the predicate :type subj: int :param obj: position in the record of the object of the predicate :type obj: int :param records: a list of records :type records: list of lists :return: ``Concept`` of arity 2 :rtype: Concept """ if not label == 'border' and not label == 'contain': label = label + '_of' c = Concept(label, arity=2, closures=closures, extension=set()) for record in records: c.augment((record[subj], record[obj])) # close the concept's extension according to the properties in closures c.close() return c def process_bundle(rels): """ Given a list of relation metadata bundles, make a corresponding dictionary of concepts, indexed by the relation name. :param rels: bundle of metadata needed for constructing a concept :type rels: list of dict :return: a dictionary of concepts, indexed by the relation name. :rtype: dict """ concepts = {} for rel in rels: rel_name = rel['rel_name'] closures = rel['closures'] schema = rel['schema'] filename = rel['filename'] concept_list = clause2concepts(filename, rel_name, schema, closures) for c in concept_list: label = c.prefLabel if (label in concepts): for data in c.extension: concepts[label].augment(data) concepts[label].close() else: concepts[label] = c return concepts def make_valuation(concepts, read=False, lexicon=False): """ Convert a list of ``Concept`` objects into a list of (label, extension) pairs; optionally create a ``Valuation`` object. :param concepts: concepts :type concepts: list(Concept) :param read: if ``True``, ``(symbol, set)`` pairs are read into a ``Valuation`` :type read: bool :rtype: list or Valuation """ vals = [] for c in concepts: vals.append((c.prefLabel, c.extension)) if lexicon: read = True if read: from nltk.sem import Valuation val = Valuation({}) val.update(vals) # add labels for individuals val = label_indivs(val, lexicon=lexicon) return val else: return vals def val_dump(rels, db): """ Make a ``Valuation`` from a list of relation metadata bundles and dump to persistent database. :param rels: bundle of metadata needed for constructing a concept :type rels: list of dict :param db: name of file to which data is written. The suffix '.db' will be automatically appended. 
:type db: string """ concepts = process_bundle(rels).values() valuation = make_valuation(concepts, read=True) db_out = shelve.open(db, 'n') db_out.update(valuation) db_out.close() def val_load(db): """ Load a ``Valuation`` from a persistent database. :param db: name of file from which data is read. The suffix '.db' should be omitted from the name. :type db: string """ dbname = db+".db" if not os.access(dbname, os.R_OK): sys.exit("Cannot read file: %s" % dbname) else: db_in = shelve.open(db) from nltk.sem import Valuation val = Valuation(db_in) # val.read(db_in.items()) return val #def alpha(str): #""" #Utility to filter out non-alphabetic constants. #:param str: candidate constant #:type str: string #:rtype: bool #""" #try: #int(str) #return False #except ValueError: ## some unknown values in records are labeled '?' #if not str == '?': #return True def label_indivs(valuation, lexicon=False): """ Assign individual constants to the individuals in the domain of a ``Valuation``. Given a valuation with an entry of the form ``{'rel': {'a': True}}``, add a new entry ``{'a': 'a'}``. :type valuation: Valuation :rtype: Valuation """ # collect all the individuals into a domain domain = valuation.domain # convert the domain into a sorted list of alphabetic terms # use the same string as a label pairs = [(e, e) for e in domain] if lexicon: lex = make_lex(domain) with open("chat_pnames.cfg", 'w') as outfile: outfile.writelines(lex) # read the pairs into the valuation valuation.update(pairs) return valuation def make_lex(symbols): """ Create lexical CFG rules for each individual symbol. Given a valuation with an entry of the form ``{'zloty': 'zloty'}``, create a lexical rule for the proper name 'Zloty'. :param symbols: a list of individual constants in the semantic representation :type symbols: sequence :rtype: list """ lex = [] header = """ ################################################################## # Lexical rules automatically generated by running 'chat80.py -x'. ################################################################## """ lex.append(header) template = "PropN[num=sg, sem=<\P.(P %s)>] -> '%s'\n" for s in symbols: parts = s.split('_') caps = [p.capitalize() for p in parts] pname = '_'.join(caps) rule = template % (s, pname) lex.append(rule) return lex ########################################################################### # Interface function to emulate other corpus readers ########################################################################### def concepts(items = items): """ Build a list of concepts corresponding to the relation names in ``items``. :param items: names of the Chat-80 relations to extract :type items: list of strings :return: the ``Concept`` objects which are extracted from the relations :rtype: list """ if isinstance(items, string_types): items = (items,) rels = [item_metadata[r] for r in items] concept_map = process_bundle(rels) return concept_map.values() ########################################################################### def main(): import sys from optparse import OptionParser description = \ """ Extract data from the Chat-80 Prolog files and convert them into a Valuation object for use in the NLTK semantics package. 
""" opts = OptionParser(description=description) opts.set_defaults(verbose=True, lex=False, vocab=False) opts.add_option("-s", "--store", dest="outdb", help="store a valuation in DB", metavar="DB") opts.add_option("-l", "--load", dest="indb", help="load a stored valuation from DB", metavar="DB") opts.add_option("-c", "--concepts", action="store_true", help="print concepts instead of a valuation") opts.add_option("-r", "--relation", dest="label", help="print concept with label REL (check possible labels with '-v' option)", metavar="REL") opts.add_option("-q", "--quiet", action="store_false", dest="verbose", help="don't print out progress info") opts.add_option("-x", "--lex", action="store_true", dest="lex", help="write a file of lexical entries for country names, then exit") opts.add_option("-v", "--vocab", action="store_true", dest="vocab", help="print out the vocabulary of concept labels and their arity, then exit") (options, args) = opts.parse_args() if options.outdb and options.indb: opts.error("Options --store and --load are mutually exclusive") if options.outdb: # write the valuation to a persistent database if options.verbose: outdb = options.outdb+".db" print("Dumping a valuation to %s" % outdb) val_dump(rels, options.outdb) sys.exit(0) else: # try to read in a valuation from a database if options.indb is not None: dbname = options.indb+".db" if not os.access(dbname, os.R_OK): sys.exit("Cannot read file: %s" % dbname) else: valuation = val_load(options.indb) # we need to create the valuation from scratch else: # build some concepts concept_map = process_bundle(rels) concepts = concept_map.values() # just print out the vocabulary if options.vocab: items = sorted([(c.arity, c.prefLabel) for c in concepts]) for (arity, label) in items: print(label, arity) sys.exit(0) # show all the concepts if options.concepts: for c in concepts: print(c) print() if options.label: print(concept_map[options.label]) sys.exit(0) else: # turn the concepts into a Valuation if options.lex: if options.verbose: print("Writing out lexical rules") make_valuation(concepts, lexicon=True) else: valuation = make_valuation(concepts, read=True) print(valuation) def sql_demo(): """ Print out every row from the 'city.db' database. """ print() print("Using SQL to extract rows from 'city.db' RDB.") for row in sql_query('corpora/city_database/city.db', "SELECT * FROM city_table"): print(row) if __name__ == '__main__': main() sql_demo() nltk-3.1/nltk/sem/cooper_storage.py0000644000076500000240000000774312607224144017212 0ustar sbstaff00000000000000# Natural Language Toolkit: Cooper storage for Quantifier Ambiguity # # Copyright (C) 2001-2015 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT from __future__ import print_function from nltk.sem.logic import LambdaExpression, ApplicationExpression, Variable from nltk.parse import load_parser from nltk.parse.featurechart import InstantiateVarsChart class CooperStore(object): """ A container for handling quantifier ambiguity via Cooper storage. 
""" def __init__(self, featstruct): """ :param featstruct: The value of the ``sem`` node in a tree from ``parse_with_bindops()`` :type featstruct: FeatStruct (with features ``core`` and ``store``) """ self.featstruct = featstruct self.readings = [] try: self.core = featstruct['CORE'] self.store = featstruct['STORE'] except KeyError: print("%s is not a Cooper storage structure" % featstruct) def _permute(self, lst): """ :return: An iterator over the permutations of the input list :type lst: list :rtype: iter """ remove = lambda lst0, index: lst0[:index] + lst0[index+1:] if lst: for index, x in enumerate(lst): for y in self._permute(remove(lst, index)): yield (x,)+y else: yield () def s_retrieve(self, trace=False): """ Carry out S-Retrieval of binding operators in store. If hack=True, serialize the bindop and core as strings and reparse. Ugh. Each permutation of the store (i.e. list of binding operators) is taken to be a possible scoping of quantifiers. We iterate through the binding operators in each permutation, and successively apply them to the current term, starting with the core semantic representation, working from the inside out. Binding operators are of the form:: bo(\P.all x.(man(x) -> P(x)),z1) """ for perm, store_perm in enumerate(self._permute(self.store)): if trace: print("Permutation %s" % (perm+1)) term = self.core for bindop in store_perm: # we just want the arguments that are wrapped by the 'bo' predicate quant, varex = tuple(bindop.args) # use var to make an abstraction over the current term and then # apply the quantifier to it term = ApplicationExpression(quant, LambdaExpression(varex.variable, term)) if trace: print(" ", term) term = term.simplify() self.readings.append(term) def parse_with_bindops(sentence, grammar=None, trace=0): """ Use a grammar with Binding Operators to parse a sentence. """ if not grammar: grammar = 'grammars/book_grammars/storage.fcfg' parser = load_parser(grammar, trace=trace, chart_class=InstantiateVarsChart) # Parse the sentence. 
tokens = sentence.split() return list(parser.parse(tokens)) def demo(): from nltk.sem import cooper_storage as cs sentence = "every girl chases a dog" #sentence = "a man gives a bone to every dog" print() print("Analysis of sentence '%s'" % sentence) print("=" * 50) trees = cs.parse_with_bindops(sentence, trace=0) for tree in trees: semrep = cs.CooperStore(tree.label()['SEM']) print() print("Binding operators:") print("-" * 15) for s in semrep.store: print(s) print() print("Core:") print("-" * 15) print(semrep.core) print() print("S-Retrieval:") print("-" * 15) semrep.s_retrieve(trace=True) print("Readings:") print("-" * 15) for i, reading in enumerate(semrep.readings): print("%s: %s" % (i+1, reading)) if __name__ == '__main__': demo() nltk-3.1/nltk/sem/drt.py0000644000076500000240000014230512607224144014762 0ustar sbstaff00000000000000# Natural Language Toolkit: Discourse Representation Theory (DRT) # # Author: Dan Garrette # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT from __future__ import print_function, unicode_literals import operator from functools import reduce from nltk.compat import string_types, python_2_unicode_compatible from nltk.sem.logic import (APP, AbstractVariableExpression, AllExpression, AndExpression, ApplicationExpression, BinaryExpression, BooleanExpression, ConstantExpression, EqualityExpression, EventVariableExpression, ExistsExpression, Expression, FunctionVariableExpression, ImpExpression, IndividualVariableExpression, LambdaExpression, Tokens, LogicParser, NegatedExpression, OrExpression, Variable, is_eventvar, is_funcvar, is_indvar, unique_variable) # Import Tkinter-based modules if they are available try: # imports are fixed for Python 2.x by nltk.compat from tkinter import Canvas from tkinter import Tk from tkinter.font import Font from nltk.util import in_idle except ImportError: # No need to print a warning here, nltk.draw has already printed one.
pass class DrtTokens(Tokens): DRS = 'DRS' DRS_CONC = '+' PRONOUN = 'PRO' OPEN_BRACKET = '[' CLOSE_BRACKET = ']' COLON = ':' PUNCT = [DRS_CONC, OPEN_BRACKET, CLOSE_BRACKET, COLON] SYMBOLS = Tokens.SYMBOLS + PUNCT TOKENS = Tokens.TOKENS + [DRS] + PUNCT class DrtParser(LogicParser): """A lambda calculus expression parser.""" def __init__(self): LogicParser.__init__(self) self.operator_precedence = dict( [(x,1) for x in DrtTokens.LAMBDA_LIST] + \ [(x,2) for x in DrtTokens.NOT_LIST] + \ [(APP,3)] + \ [(x,4) for x in DrtTokens.EQ_LIST+Tokens.NEQ_LIST] + \ [(DrtTokens.COLON,5)] + \ [(DrtTokens.DRS_CONC,6)] + \ [(x,7) for x in DrtTokens.OR_LIST] + \ [(x,8) for x in DrtTokens.IMP_LIST] + \ [(None,9)]) def get_all_symbols(self): """This method exists to be overridden""" return DrtTokens.SYMBOLS def isvariable(self, tok): return tok not in DrtTokens.TOKENS def handle(self, tok, context): """This method is intended to be overridden for logics that use different operators or expressions""" if tok in DrtTokens.NOT_LIST: return self.handle_negation(tok, context) elif tok in DrtTokens.LAMBDA_LIST: return self.handle_lambda(tok, context) elif tok == DrtTokens.OPEN: if self.inRange(0) and self.token(0) == DrtTokens.OPEN_BRACKET: return self.handle_DRS(tok, context) else: return self.handle_open(tok, context) elif tok.upper() == DrtTokens.DRS: self.assertNextToken(DrtTokens.OPEN) return self.handle_DRS(tok, context) elif self.isvariable(tok): if self.inRange(0) and self.token(0) == DrtTokens.COLON: return self.handle_prop(tok, context) else: return self.handle_variable(tok, context) def make_NegatedExpression(self, expression): return DrtNegatedExpression(expression) def handle_DRS(self, tok, context): # a DRS refs = self.handle_refs() if self.inRange(0) and self.token(0) == DrtTokens.COMMA: #if there is a comma (it's optional) self.token() # swallow the comma conds = self.handle_conds(context) self.assertNextToken(DrtTokens.CLOSE) return DRS(refs, conds, None) def handle_refs(self): self.assertNextToken(DrtTokens.OPEN_BRACKET) refs = [] while self.inRange(0) and self.token(0) != DrtTokens.CLOSE_BRACKET: # Support expressions like: DRS([x y],C) == DRS([x,y],C) if refs and self.token(0) == DrtTokens.COMMA: self.token() # swallow the comma refs.append(self.get_next_token_variable('quantified')) self.assertNextToken(DrtTokens.CLOSE_BRACKET) return refs def handle_conds(self, context): self.assertNextToken(DrtTokens.OPEN_BRACKET) conds = [] while self.inRange(0) and self.token(0) != DrtTokens.CLOSE_BRACKET: # Support expressions like: DRS([x y],C) == DRS([x, y],C) if conds and self.token(0) == DrtTokens.COMMA: self.token() # swallow the comma conds.append(self.process_next_expression(context)) self.assertNextToken(DrtTokens.CLOSE_BRACKET) return conds def handle_prop(self, tok, context): variable = self.make_VariableExpression(tok) self.assertNextToken(':') drs = self.process_next_expression(DrtTokens.COLON) return DrtProposition(variable, drs) def make_EqualityExpression(self, first, second): """This method serves as a hook for other logic parsers that have different equality expression classes""" return DrtEqualityExpression(first, second) def get_BooleanExpression_factory(self, tok): """This method serves as a hook for other logic parsers that have different boolean operators""" if tok == DrtTokens.DRS_CONC: return lambda first, second: DrtConcatenation(first, second, None) elif tok in DrtTokens.OR_LIST: return DrtOrExpression elif tok in DrtTokens.IMP_LIST: def make_imp_expression(first, second): if 
isinstance(first, DRS): return DRS(first.refs, first.conds, second) if isinstance(first, DrtConcatenation): return DrtConcatenation(first.first, first.second, second) raise Exception('Antecedent of implication must be a DRS') return make_imp_expression else: return None def make_BooleanExpression(self, factory, first, second): return factory(first, second) def make_ApplicationExpression(self, function, argument): return DrtApplicationExpression(function, argument) def make_VariableExpression(self, name): return DrtVariableExpression(Variable(name)) def make_LambdaExpression(self, variables, term): return DrtLambdaExpression(variables, term) class DrtExpression(object): """ This is the base abstract DRT Expression from which every DRT Expression extends. """ _drt_parser = DrtParser() @classmethod def fromstring(cls, s): return cls._drt_parser.parse(s) def applyto(self, other): return DrtApplicationExpression(self, other) def __neg__(self): return DrtNegatedExpression(self) def __and__(self, other): raise NotImplementedError() def __or__(self, other): assert isinstance(other, DrtExpression) return DrtOrExpression(self, other) def __gt__(self, other): assert isinstance(other, DrtExpression) if isinstance(self, DRS): return DRS(self.refs, self.conds, other) if isinstance(self, DrtConcatenation): return DrtConcatenation(self.first, self.second, other) raise Exception('Antecedent of implication must be a DRS') def equiv(self, other, prover=None): """ Check for logical equivalence. Pass the expression (self <-> other) to the theorem prover. If the prover says it is valid, then the self and other are equal. :param other: an ``DrtExpression`` to check equality against :param prover: a ``nltk.inference.api.Prover`` """ assert isinstance(other, DrtExpression) f1 = self.simplify().fol(); f2 = other.simplify().fol(); return f1.equiv(f2, prover) @property def type(self): raise AttributeError("'%s' object has no attribute 'type'" % self.__class__.__name__) def typecheck(self, signature=None): raise NotImplementedError() def __add__(self, other): return DrtConcatenation(self, other, None) def get_refs(self, recursive=False): """ Return the set of discourse referents in this DRS. :param recursive: bool Also find discourse referents in subterms? :return: list of ``Variable`` objects """ raise NotImplementedError() def is_pronoun_function(self): """ Is self of the form "PRO(x)"? 
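That is, an application of the reserved ``PRO`` predicate to an individual variable, as in the condition ``PRO(x)`` of ``([x],[PRO(x), sees(John,x)])``. A quick sketch::

    >>> DrtExpression.fromstring('PRO(x)').is_pronoun_function()
    True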
""" return isinstance(self, DrtApplicationExpression) and \ isinstance(self.function, DrtAbstractVariableExpression) and \ self.function.variable.name == DrtTokens.PRONOUN and \ isinstance(self.argument, DrtIndividualVariableExpression) def make_EqualityExpression(self, first, second): return DrtEqualityExpression(first, second) def make_VariableExpression(self, variable): return DrtVariableExpression(variable) def resolve_anaphora(self): return resolve_anaphora(self) def eliminate_equality(self): return self.visit_structured(lambda e: e.eliminate_equality(), self.__class__) def pretty_format(self): """ Draw the DRS :return: the pretty print string """ return '\n'.join(self._pretty()) def pretty_print(self): print(self.pretty_format()) def draw(self): DrsDrawer(self).draw() @python_2_unicode_compatible class DRS(DrtExpression, Expression): """A Discourse Representation Structure.""" def __init__(self, refs, conds, consequent=None): """ :param refs: list of ``DrtIndividualVariableExpression`` for the discourse referents :param conds: list of ``Expression`` for the conditions """ self.refs = refs self.conds = conds self.consequent = consequent def replace(self, variable, expression, replace_bound=False, alpha_convert=True): """Replace all instances of variable v with expression E in self, where v is free in self.""" if variable in self.refs: #if a bound variable is the thing being replaced if not replace_bound: return self else: i = self.refs.index(variable) if self.consequent: consequent = self.consequent.replace(variable, expression, True, alpha_convert) else: consequent = None return DRS(self.refs[:i]+[expression.variable]+self.refs[i+1:], [cond.replace(variable, expression, True, alpha_convert) for cond in self.conds], consequent) else: if alpha_convert: # any bound variable that appears in the expression must # be alpha converted to avoid a conflict for ref in (set(self.refs) & expression.free()): newvar = unique_variable(ref) newvarex = DrtVariableExpression(newvar) i = self.refs.index(ref) if self.consequent: consequent = self.consequent.replace(ref, newvarex, True, alpha_convert) else: consequent = None self = DRS(self.refs[:i]+[newvar]+self.refs[i+1:], [cond.replace(ref, newvarex, True, alpha_convert) for cond in self.conds], consequent) #replace in the conditions if self.consequent: consequent = self.consequent.replace(variable, expression, replace_bound, alpha_convert) else: consequent = None return DRS(self.refs, [cond.replace(variable, expression, replace_bound, alpha_convert) for cond in self.conds], consequent) def free(self): """:see: Expression.free()""" conds_free = reduce(operator.or_, [c.free() for c in self.conds], set()) if self.consequent: conds_free.update(self.consequent.free()) return conds_free - set(self.refs) def get_refs(self, recursive=False): """:see: AbstractExpression.get_refs()""" if recursive: conds_refs = self.refs + sum((c.get_refs(True) for c in self.conds), []) if self.consequent: conds_refs.extend(self.consequent.get_refs(True)) return conds_refs else: return self.refs def visit(self, function, combinator): """:see: Expression.visit()""" parts = list(map(function, self.conds)) if self.consequent: parts.append(function(self.consequent)) return combinator(parts) def visit_structured(self, function, combinator): """:see: Expression.visit_structured()""" consequent = (function(self.consequent) if self.consequent else None) return combinator(self.refs, list(map(function, self.conds)), consequent) def eliminate_equality(self): drs = self i = 0 while i < 
len(drs.conds): cond = drs.conds[i] if isinstance(cond, EqualityExpression) and \ isinstance(cond.first, AbstractVariableExpression) and \ isinstance(cond.second, AbstractVariableExpression): drs = DRS(list(set(drs.refs)-set([cond.second.variable])), drs.conds[:i]+drs.conds[i+1:], drs.consequent) if cond.second.variable != cond.first.variable: drs = drs.replace(cond.second.variable, cond.first, False, False) i = 0 i -= 1 i += 1 conds = [] for cond in drs.conds: new_cond = cond.eliminate_equality() new_cond_simp = new_cond.simplify() if not isinstance(new_cond_simp, DRS) or \ new_cond_simp.refs or new_cond_simp.conds or \ new_cond_simp.consequent: conds.append(new_cond) consequent = (drs.consequent.eliminate_equality() if drs.consequent else None) return DRS(drs.refs, conds, consequent) def fol(self): if self.consequent: accum = None if self.conds: accum = reduce(AndExpression, [c.fol() for c in self.conds]) if accum: accum = ImpExpression(accum, self.consequent.fol()) else: accum = self.consequent.fol() for ref in self.refs[::-1]: accum = AllExpression(ref, accum) return accum else: if not self.conds: raise Exception("Cannot convert DRS with no conditions to FOL.") accum = reduce(AndExpression, [c.fol() for c in self.conds]) for ref in map(Variable, self._order_ref_strings(self.refs)[::-1]): accum = ExistsExpression(ref, accum) return accum def _pretty(self): refs_line = ' '.join(self._order_ref_strings(self.refs)) cond_lines = [cond for cond_line in [filter(lambda s: s.strip(), cond._pretty()) for cond in self.conds] for cond in cond_line] length = max([len(refs_line)] + list(map(len, cond_lines))) drs = ([' _' + '_' * length + '_ ', '| ' + refs_line.ljust(length) + ' |', '|-' + '-' * length + '-|'] + ['| ' + line.ljust(length) + ' |' for line in cond_lines] + ['|_' + '_' * length + '_|']) if self.consequent: return DrtBinaryExpression._assemble_pretty(drs, DrtTokens.IMP, self.consequent._pretty()) return drs def _order_ref_strings(self, refs): strings = ["%s" % ref for ref in refs] ind_vars = [] func_vars = [] event_vars = [] other_vars = [] for s in strings: if is_indvar(s): ind_vars.append(s) elif is_funcvar(s): func_vars.append(s) elif is_eventvar(s): event_vars.append(s) else: other_vars.append(s) return sorted(other_vars) + \ sorted(event_vars, key=lambda v: int([v[2:],-1][len(v[2:]) == 0])) + \ sorted(func_vars, key=lambda v: (v[0], int([v[1:],-1][len(v[1:])==0]))) + \ sorted(ind_vars, key=lambda v: (v[0], int([v[1:],-1][len(v[1:])==0]))) def __eq__(self, other): r"""Defines equality modulo alphabetic variance. 
If we are comparing \x.M and \y.N, then check equality of M and N[x/y].""" if isinstance(other, DRS): if len(self.refs) == len(other.refs): converted_other = other for (r1, r2) in zip(self.refs, converted_other.refs): varex = self.make_VariableExpression(r1) converted_other = converted_other.replace(r2, varex, True) if self.consequent == converted_other.consequent and \ len(self.conds) == len(converted_other.conds): for c1, c2 in zip(self.conds, converted_other.conds): if not (c1 == c2): return False return True return False def __ne__(self, other): return not self == other __hash__ = Expression.__hash__ def __str__(self): drs = '([%s],[%s])' % (','.join(self._order_ref_strings(self.refs)), ', '.join("%s" % cond for cond in self.conds)) # map(str, self.conds))) if self.consequent: return DrtTokens.OPEN + drs + ' ' + DrtTokens.IMP + ' ' + \ "%s" % self.consequent + DrtTokens.CLOSE return drs def DrtVariableExpression(variable): """ This is a factory method that instantiates and returns a subtype of ``DrtAbstractVariableExpression`` appropriate for the given variable. """ if is_indvar(variable.name): return DrtIndividualVariableExpression(variable) elif is_funcvar(variable.name): return DrtFunctionVariableExpression(variable) elif is_eventvar(variable.name): return DrtEventVariableExpression(variable) else: return DrtConstantExpression(variable) class DrtAbstractVariableExpression(DrtExpression, AbstractVariableExpression): def fol(self): return self def get_refs(self, recursive=False): """:see: AbstractExpression.get_refs()""" return [] def _pretty(self): s = "%s" % self blank = ' '*len(s) return [blank, blank, s, blank] def eliminate_equality(self): return self class DrtIndividualVariableExpression(DrtAbstractVariableExpression, IndividualVariableExpression): pass class DrtFunctionVariableExpression(DrtAbstractVariableExpression, FunctionVariableExpression): pass class DrtEventVariableExpression(DrtIndividualVariableExpression, EventVariableExpression): pass class DrtConstantExpression(DrtAbstractVariableExpression, ConstantExpression): pass @python_2_unicode_compatible class DrtProposition(DrtExpression, Expression): def __init__(self, variable, drs): self.variable = variable self.drs = drs def replace(self, variable, expression, replace_bound=False, alpha_convert=True): if self.variable == variable: assert isinstance(expression, DrtAbstractVariableExpression), "Can only replace a proposition label with a variable" return DrtProposition(expression.variable, self.drs.replace(variable, expression, replace_bound, alpha_convert)) else: return DrtProposition(self.variable, self.drs.replace(variable, expression, replace_bound, alpha_convert)) def eliminate_equality(self): return DrtProposition(self.variable, self.drs.eliminate_equality()) def get_refs(self, recursive=False): return (self.drs.get_refs(True) if recursive else []) def __eq__(self, other): return self.__class__ == other.__class__ and \ self.variable == other.variable and \ self.drs == other.drs def __ne__(self, other): return not self == other __hash__ = Expression.__hash__ def fol(self): return self.drs.fol() def _pretty(self): drs_s = self.drs._pretty() blank = ' ' * len("%s" % self.variable) return ([blank + ' ' + line for line in drs_s[:1]] + ["%s" % self.variable + ':' + line for line in drs_s[1:2]] + [blank + ' ' + line for line in drs_s[2:]]) def visit(self, function, combinator): """:see: Expression.visit()""" return combinator([function(self.drs)]) def visit_structured(self, function, combinator): """:see: 
Expression.visit_structured()""" return combinator(self.variable, function(self.drs)) def __str__(self): return 'prop(%s, %s)' % (self.variable, self.drs) class DrtNegatedExpression(DrtExpression, NegatedExpression): def fol(self): return NegatedExpression(self.term.fol()) def get_refs(self, recursive=False): """:see: AbstractExpression.get_refs()""" return self.term.get_refs(recursive) def _pretty(self): term_lines = self.term._pretty() return ([' ' + line for line in term_lines[:2]] + ['__ ' + line for line in term_lines[2:3]] + [' | ' + line for line in term_lines[3:4]] + [' ' + line for line in term_lines[4:]]) class DrtLambdaExpression(DrtExpression, LambdaExpression): def alpha_convert(self, newvar): """Rename all occurrences of the variable introduced by this variable binder in the expression to ``newvar``. :param newvar: ``Variable``, for the new variable """ return self.__class__(newvar, self.term.replace(self.variable, DrtVariableExpression(newvar), True)) def fol(self): return LambdaExpression(self.variable, self.term.fol()) def _pretty(self): variables = [self.variable] term = self.term while term.__class__ == self.__class__: variables.append(term.variable) term = term.term var_string = ' '.join("%s" % v for v in variables) + DrtTokens.DOT term_lines = term._pretty() blank = ' ' * len(var_string) return ([' ' + blank + line for line in term_lines[:1]] + [' \ ' + blank + line for line in term_lines[1:2]] + [' /\ ' + var_string + line for line in term_lines[2:3]] + [' ' + blank + line for line in term_lines[3:]]) class DrtBinaryExpression(DrtExpression, BinaryExpression): def get_refs(self, recursive=False): """:see: AbstractExpression.get_refs()""" return self.first.get_refs(True) + self.second.get_refs(True) if recursive else [] def _pretty(self): return DrtBinaryExpression._assemble_pretty(self._pretty_subex(self.first), self.getOp(), self._pretty_subex(self.second)) @staticmethod def _assemble_pretty(first_lines, op, second_lines): max_lines = max(len(first_lines), len(second_lines)) first_lines = _pad_vertically(first_lines, max_lines) second_lines = _pad_vertically(second_lines, max_lines) blank = ' ' * len(op) first_second_lines = list(zip(first_lines, second_lines)) return ([' ' + first_line + ' ' + blank + ' ' + second_line + ' ' for first_line, second_line in first_second_lines[:2]] + ['(' + first_line + ' ' + op + ' ' + second_line + ')' for first_line, second_line in first_second_lines[2:3]] + [' ' + first_line + ' ' + blank + ' ' + second_line + ' ' for first_line, second_line in first_second_lines[3:]]) def _pretty_subex(self, subex): return subex._pretty() class DrtBooleanExpression(DrtBinaryExpression, BooleanExpression): pass class DrtOrExpression(DrtBooleanExpression, OrExpression): def fol(self): return OrExpression(self.first.fol(), self.second.fol()) def _pretty_subex(self, subex): if isinstance(subex, DrtOrExpression): return [line[1:-1] for line in subex._pretty()] return DrtBooleanExpression._pretty_subex(self, subex) class DrtEqualityExpression(DrtBinaryExpression, EqualityExpression): def fol(self): return EqualityExpression(self.first.fol(), self.second.fol()) @python_2_unicode_compatible class DrtConcatenation(DrtBooleanExpression): """DRS of the form '(DRS + DRS)'""" def __init__(self, first, second, consequent=None): DrtBooleanExpression.__init__(self, first, second) self.consequent = consequent def replace(self, variable, expression, replace_bound=False, alpha_convert=True): """Replace all instances of variable v with expression E in self, where v 
is free in self.""" first = self.first second = self.second consequent = self.consequent # If variable is bound if variable in self.get_refs(): if replace_bound: first = first.replace(variable, expression, replace_bound, alpha_convert) second = second.replace(variable, expression, replace_bound, alpha_convert) if consequent: consequent = consequent.replace(variable, expression, replace_bound, alpha_convert) else: if alpha_convert: # alpha convert every ref that is free in 'expression' for ref in (set(self.get_refs(True)) & expression.free()): v = DrtVariableExpression(unique_variable(ref)) first = first.replace(ref, v, True, alpha_convert) second = second.replace(ref, v, True, alpha_convert) if consequent: consequent = consequent.replace(ref, v, True, alpha_convert) first = first.replace(variable, expression, replace_bound, alpha_convert) second = second.replace(variable, expression, replace_bound, alpha_convert) if consequent: consequent = consequent.replace(variable, expression, replace_bound, alpha_convert) return self.__class__(first, second, consequent) def eliminate_equality(self): #TODO: at some point. for now, simplify. drs = self.simplify() assert not isinstance(drs, DrtConcatenation) return drs.eliminate_equality() def simplify(self): first = self.first.simplify() second = self.second.simplify() consequent = (self.consequent.simplify() if self.consequent else None) if isinstance(first, DRS) and isinstance(second, DRS): # For any ref that is in both 'first' and 'second' for ref in (set(first.get_refs(True)) & set(second.get_refs(True))): # alpha convert the ref in 'second' to prevent collision newvar = DrtVariableExpression(unique_variable(ref)) second = second.replace(ref, newvar, True) return DRS(first.refs + second.refs, first.conds + second.conds, consequent) else: return self.__class__(first, second, consequent) def get_refs(self, recursive=False): """:see: AbstractExpression.get_refs()""" refs = self.first.get_refs(recursive) + self.second.get_refs(recursive) if self.consequent and recursive: refs.extend(self.consequent.get_refs(True)) return refs def getOp(self): return DrtTokens.DRS_CONC def __eq__(self, other): r"""Defines equality modulo alphabetic variance. 
If we are comparing \x.M and \y.N, then check equality of M and N[x/y].""" if isinstance(other, DrtConcatenation): self_refs = self.get_refs() other_refs = other.get_refs() if len(self_refs) == len(other_refs): converted_other = other for (r1,r2) in zip(self_refs, other_refs): varex = self.make_VariableExpression(r1) converted_other = converted_other.replace(r2, varex, True) return self.first == converted_other.first and \ self.second == converted_other.second and \ self.consequent == converted_other.consequent return False def __ne__(self, other): return not self == other __hash__ = DrtBooleanExpression.__hash__ def fol(self): e = AndExpression(self.first.fol(), self.second.fol()) if self.consequent: e = ImpExpression(e, self.consequent.fol()) return e def _pretty(self): drs = DrtBinaryExpression._assemble_pretty(self._pretty_subex(self.first), self.getOp(), self._pretty_subex(self.second)) if self.consequent: drs = DrtBinaryExpression._assemble_pretty(drs, DrtTokens.IMP, self._pretty(self.consequent)) return drs def _pretty_subex(self, subex): if isinstance(subex, DrtConcatenation): return [line[1:-1] for line in subex._pretty()] return DrtBooleanExpression._pretty_subex(self, subex) def visit(self, function, combinator): """:see: Expression.visit()""" if self.consequent: return combinator([function(self.first), function(self.second), function(self.consequent)]) else: return combinator([function(self.first), function(self.second)]) def __str__(self): first = self._str_subex(self.first) second = self._str_subex(self.second) drs = Tokens.OPEN + first + ' ' + self.getOp() \ + ' ' + second + Tokens.CLOSE if self.consequent: return DrtTokens.OPEN + drs + ' ' + DrtTokens.IMP + ' ' + \ "%s" % self.consequent + DrtTokens.CLOSE return drs def _str_subex(self, subex): s = "%s" % subex if isinstance(subex, DrtConcatenation) and subex.consequent is None: return s[1:-1] return s class DrtApplicationExpression(DrtExpression, ApplicationExpression): def fol(self): return ApplicationExpression(self.function.fol(), self.argument.fol()) def get_refs(self, recursive=False): """:see: AbstractExpression.get_refs()""" return (self.function.get_refs(True) + self.argument.get_refs(True) if recursive else []) def _pretty(self): function, args = self.uncurry() function_lines = function._pretty() args_lines = [arg._pretty() for arg in args] max_lines = max(map(len, [function_lines] + args_lines)) function_lines = _pad_vertically(function_lines, max_lines) args_lines = [_pad_vertically(arg_lines, max_lines) for arg_lines in args_lines] func_args_lines = list(zip(function_lines, list(zip(*args_lines)))) return ([func_line + ' ' + ' '.join(args_line) + ' ' for func_line, args_line in func_args_lines[:2]] + [func_line + '(' + ','.join(args_line) + ')' for func_line, args_line in func_args_lines[2:3]] + [func_line + ' ' + ' '.join(args_line) + ' ' for func_line, args_line in func_args_lines[3:]]) def _pad_vertically(lines, max_lines): pad_line = [' ' * len(lines[0])] return lines + pad_line * (max_lines - len(lines)) @python_2_unicode_compatible class PossibleAntecedents(list, DrtExpression, Expression): def free(self): """Set of free variables.""" return set(self) def replace(self, variable, expression, replace_bound=False, alpha_convert=True): """Replace all instances of variable v with expression E in self, where v is free in self.""" result = PossibleAntecedents() for item in self: if item == variable: self.append(expression) else: self.append(item) return result def _pretty(self): s = "%s" % self blank = ' ' * 
len(s) return [blank, blank, s] def __str__(self): return '[' + ','.join("%s" % it for it in self) + ']' class AnaphoraResolutionException(Exception): pass def resolve_anaphora(expression, trail=[]): if isinstance(expression, ApplicationExpression): if expression.is_pronoun_function(): possible_antecedents = PossibleAntecedents() for ancestor in trail: for ref in ancestor.get_refs(): refex = expression.make_VariableExpression(ref) #========================================================== # Don't allow resolution to itself or other types #========================================================== if refex.__class__ == expression.argument.__class__ and \ not (refex == expression.argument): possible_antecedents.append(refex) if len(possible_antecedents) == 1: resolution = possible_antecedents[0] else: resolution = possible_antecedents return expression.make_EqualityExpression(expression.argument, resolution) else: r_function = resolve_anaphora(expression.function, trail + [expression]) r_argument = resolve_anaphora(expression.argument, trail + [expression]) return expression.__class__(r_function, r_argument) elif isinstance(expression, DRS): r_conds = [] for cond in expression.conds: r_cond = resolve_anaphora(cond, trail + [expression]) # if the condition is of the form '(x = [])' then raise exception if isinstance(r_cond, EqualityExpression): if isinstance(r_cond.first, PossibleAntecedents): #Reverse the order so that the variable is on the left temp = r_cond.first r_cond.first = r_cond.second r_cond.second = temp if isinstance(r_cond.second, PossibleAntecedents): if not r_cond.second: raise AnaphoraResolutionException("Variable '%s' does not " "resolve to anything." % r_cond.first) r_conds.append(r_cond) if expression.consequent: consequent = resolve_anaphora(expression.consequent, trail + [expression]) else: consequent = None return expression.__class__(expression.refs, r_conds, consequent) elif isinstance(expression, AbstractVariableExpression): return expression elif isinstance(expression, NegatedExpression): return expression.__class__(resolve_anaphora(expression.term, trail + [expression])) elif isinstance(expression, DrtConcatenation): if expression.consequent: consequent = resolve_anaphora(expression.consequent, trail + [expression]) else: consequent = None return expression.__class__(resolve_anaphora(expression.first, trail + [expression]), resolve_anaphora(expression.second, trail + [expression]), consequent) elif isinstance(expression, BinaryExpression): return expression.__class__(resolve_anaphora(expression.first, trail + [expression]), resolve_anaphora(expression.second, trail + [expression])) elif isinstance(expression, LambdaExpression): return expression.__class__(expression.variable, resolve_anaphora(expression.term, trail + [expression])) class DrsDrawer(object): BUFFER = 3 #Space between elements TOPSPACE = 10 #Space above whole DRS OUTERSPACE = 6 #Space to the left, right, and bottom of the whle DRS def __init__(self, drs, size_canvas=True, canvas=None): """ :param drs: ``DrtExpression``, The DRS to be drawn :param size_canvas: bool, True if the canvas size should be the exact size of the DRS :param canvas: ``Canvas`` The canvas on which to draw the DRS. If none is given, create a new canvas. 
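A sketch of direct use (normally reached via ``DrtExpression.draw()``; it opens a Tkinter window, so a display must be available)::

    >>> from nltk.sem.drt import DrtExpression, DrsDrawer
    >>> drs = DrtExpression.fromstring('([x],[man(x), walks(x)])')
    >>> DrsDrawer(drs).draw()                   # doctest: +SKIP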
""" master = None if not canvas: master = Tk() master.title("DRT") font = Font(family='helvetica', size=12) if size_canvas: canvas = Canvas(master, width=0, height=0) canvas.font = font self.canvas = canvas (right, bottom) = self._visit(drs, self.OUTERSPACE, self.TOPSPACE) width = max(right+self.OUTERSPACE, 100) height = bottom+self.OUTERSPACE canvas = Canvas(master, width=width, height=height)#, bg='white') else: canvas = Canvas(master, width=300, height=300) canvas.pack() canvas.font = font self.canvas = canvas self.drs = drs self.master = master def _get_text_height(self): """Get the height of a line of text""" return self.canvas.font.metrics("linespace") def draw(self, x=OUTERSPACE, y=TOPSPACE): """Draw the DRS""" self._handle(self.drs, self._draw_command, x, y) if self.master and not in_idle(): self.master.mainloop() else: return self._visit(self.drs, x, y) def _visit(self, expression, x, y): """ Return the bottom-rightmost point without actually drawing the item :param expression: the item to visit :param x: the top of the current drawing area :param y: the left side of the current drawing area :return: the bottom-rightmost point """ return self._handle(expression, self._visit_command, x, y) def _draw_command(self, item, x, y): """ Draw the given item at the given location :param item: the item to draw :param x: the top of the current drawing area :param y: the left side of the current drawing area :return: the bottom-rightmost point """ if isinstance(item, string_types): self.canvas.create_text(x, y, anchor='nw', font=self.canvas.font, text=item) elif isinstance(item, tuple): # item is the lower-right of a box (right, bottom) = item self.canvas.create_rectangle(x, y, right, bottom) horiz_line_y = y + self._get_text_height() + (self.BUFFER * 2) #the line separating refs from conds self.canvas.create_line(x, horiz_line_y, right, horiz_line_y) return self._visit_command(item, x, y) def _visit_command(self, item, x, y): """ Return the bottom-rightmost point without actually drawing the item :param item: the item to visit :param x: the top of the current drawing area :param y: the left side of the current drawing area :return: the bottom-rightmost point """ if isinstance(item, string_types): return (x + self.canvas.font.measure(item), y + self._get_text_height()) elif isinstance(item, tuple): return item def _handle(self, expression, command, x=0, y=0): """ :param expression: the expression to handle :param command: the function to apply, either _draw_command or _visit_command :param x: the top of the current drawing area :param y: the left side of the current drawing area :return: the bottom-rightmost point """ if command == self._visit_command: #if we don't need to draw the item, then we can use the cached values try: #attempt to retrieve cached values right = expression._drawing_width + x bottom = expression._drawing_height + y return (right, bottom) except AttributeError: #the values have not been cached yet, so compute them pass if isinstance(expression, DrtAbstractVariableExpression): factory = self._handle_VariableExpression elif isinstance(expression, DRS): factory = self._handle_DRS elif isinstance(expression, DrtNegatedExpression): factory = self._handle_NegatedExpression elif isinstance(expression, DrtLambdaExpression): factory = self._handle_LambdaExpression elif isinstance(expression, BinaryExpression): factory = self._handle_BinaryExpression elif isinstance(expression, DrtApplicationExpression): factory = self._handle_ApplicationExpression elif isinstance(expression, 
PossibleAntecedents): factory = self._handle_VariableExpression elif isinstance(expression, DrtProposition): factory = self._handle_DrtProposition else: raise Exception(expression.__class__.__name__) (right, bottom) = factory(expression, command, x, y) #cache the values expression._drawing_width = right - x expression._drawing_height = bottom - y return (right, bottom) def _handle_VariableExpression(self, expression, command, x, y): return command("%s" % expression, x, y) def _handle_NegatedExpression(self, expression, command, x, y): # Find the width of the negation symbol right = self._visit_command(DrtTokens.NOT, x, y)[0] # Handle term (right, bottom) = self._handle(expression.term, command, right, y) # Handle variables now that we know the y-coordinate command(DrtTokens.NOT, x, self._get_centered_top(y, bottom - y, self._get_text_height())) return (right, bottom) def _handle_DRS(self, expression, command, x, y): left = x + self.BUFFER #indent the left side bottom = y + self.BUFFER #indent the top # Handle Discourse Referents if expression.refs: refs = ' '.join("%s"%r for r in expression.refs) else: refs = ' ' (max_right, bottom) = command(refs, left, bottom) bottom += (self.BUFFER * 2) # Handle Conditions if expression.conds: for cond in expression.conds: (right, bottom) = self._handle(cond, command, left, bottom) max_right = max(max_right, right) bottom += self.BUFFER else: bottom += self._get_text_height() + self.BUFFER # Handle Box max_right += self.BUFFER return command((max_right, bottom), x, y) def _handle_ApplicationExpression(self, expression, command, x, y): function, args = expression.uncurry() if not isinstance(function, DrtAbstractVariableExpression): #It's not a predicate expression ("P(x,y)"), so leave arguments curried function = expression.function args = [expression.argument] # Get the max bottom of any element on the line function_bottom = self._visit(function, x, y)[1] max_bottom = max([function_bottom] + [self._visit(arg, x, y)[1] for arg in args]) line_height = max_bottom - y # Handle 'function' function_drawing_top = self._get_centered_top(y, line_height, function._drawing_height) right = self._handle(function, command, x, function_drawing_top)[0] # Handle open paren centred_string_top = self._get_centered_top(y, line_height, self._get_text_height()) right = command(DrtTokens.OPEN, right, centred_string_top)[0] # Handle each arg for (i,arg) in enumerate(args): arg_drawing_top = self._get_centered_top(y, line_height, arg._drawing_height) right = self._handle(arg, command, right, arg_drawing_top)[0] if i+1 < len(args): #since it's not the last arg, add a comma right = command(DrtTokens.COMMA + ' ', right, centred_string_top)[0] # Handle close paren right = command(DrtTokens.CLOSE, right, centred_string_top)[0] return (right, max_bottom) def _handle_LambdaExpression(self, expression, command, x, y): # Find the width of the lambda symbol and abstracted variables variables = DrtTokens.LAMBDA + "%s" % expression.variable + DrtTokens.DOT right = self._visit_command(variables, x, y)[0] # Handle term (right, bottom) = self._handle(expression.term, command, right, y) # Handle variables now that we know the y-coordinate command(variables, x, self._get_centered_top(y, bottom - y, self._get_text_height())) return (right, bottom) def _handle_BinaryExpression(self, expression, command, x, y): # Get the full height of the line, based on the operands first_height = self._visit(expression.first, 0, 0)[1] second_height = self._visit(expression.second, 0, 0)[1] line_height = 
max(first_height, second_height) # Handle open paren centred_string_top = self._get_centered_top(y, line_height, self._get_text_height()) right = command(DrtTokens.OPEN, x, centred_string_top)[0] # Handle the first operand first_height = expression.first._drawing_height (right, first_bottom) = self._handle(expression.first, command, right, self._get_centered_top(y, line_height, first_height)) # Handle the operator right = command(' %s ' % expression.getOp(), right, centred_string_top)[0] # Handle the second operand second_height = expression.second._drawing_height (right, second_bottom) = self._handle(expression.second, command, right, self._get_centered_top(y, line_height, second_height)) # Handle close paren right = command(DrtTokens.CLOSE, right, centred_string_top)[0] return (right, max(first_bottom, second_bottom)) def _handle_DrtProposition(self, expression, command, x, y): # Find the width of the negation symbol right = command(expression.variable, x, y)[0] # Handle term (right, bottom) = self._handle(expression.term, command, right, y) return (right, bottom) def _get_centered_top(self, top, full_height, item_height): """Get the y-coordinate of the point that a figure should start at if its height is 'item_height' and it needs to be centered in an area that starts at 'top' and is 'full_height' tall.""" return top + (full_height - item_height) / 2 def demo(): print('='*20 + 'TEST PARSE' + '='*20) dexpr = DrtExpression.fromstring print(dexpr(r'([x,y],[sees(x,y)])')) print(dexpr(r'([x],[man(x), walks(x)])')) print(dexpr(r'\x.\y.([],[sees(x,y)])')) print(dexpr(r'\x.([],[walks(x)])(john)')) print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))')) print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))')) print(dexpr(r'([x],[PRO(x), sees(John,x)])')) print(dexpr(r'([x],[man(x), -([],[walks(x)])])')) print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])')) print('='*20 + 'Test fol()' + '='*20) print(dexpr(r'([x,y],[sees(x,y)])').fol()) print('='*20 + 'Test alpha conversion and lambda expression equality' + '='*20) e1 = dexpr(r'\x.([],[P(x)])') print(e1) e2 = e1.alpha_convert(Variable('z')) print(e2) print(e1 == e2) print('='*20 + 'Test resolve_anaphora()' + '='*20) print(resolve_anaphora(dexpr(r'([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])'))) print(resolve_anaphora(dexpr(r'([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])'))) print(resolve_anaphora(dexpr(r'(([x,y],[]) + ([],[PRO(x)]))'))) print('='*20 + 'Test pretty_print()' + '='*20) dexpr(r"([],[])").pretty_print() dexpr(r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])").pretty_print() dexpr(r"([x,y],[x=y]) + ([z],[dog(z), walk(z)])").pretty_print() dexpr(r"([],[([x],[]) | ([y],[]) | ([z],[dog(z), walk(z)])])").pretty_print() dexpr(r"\P.\Q.(([x],[]) + P(x) + Q(x))(\x.([],[dog(x)]))").pretty_print() def test_draw(): try: from tkinter import Tk except ImportError: from nose import SkipTest raise SkipTest("tkinter is required, but it's not available.") expressions = [ r'x', r'([],[])', r'([x],[])', r'([x],[man(x)])', r'([x,y],[sees(x,y)])', r'([x],[man(x), walks(x)])', r'\x.([],[man(x), walks(x)])', r'\x y.([],[sees(x,y)])', r'([],[(([],[walks(x)]) + ([],[runs(x)]))])', r'([x],[man(x), -([],[walks(x)])])', r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])' ] for e in expressions: d = DrtExpression.fromstring(e) d.draw() if __name__ == '__main__': demo() nltk-3.1/nltk/sem/drt_glue_demo.py0000644000076500000240000004376012607224144017007 0ustar sbstaff00000000000000# Natural Language Toolkit: GUI Demo for Glue Semantics with 
Discourse # Representation Theory (DRT) as meaning language # # Author: Dan Garrette # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT from nltk import compat # this fixes tkinter imports for Python 2.x try: from tkinter.font import Font from tkinter import (Button, Frame, IntVar, Label, Listbox, Menu, Scrollbar, Tk) from nltk.draw.util import CanvasFrame, ShowText except ImportError: """Ignore ImportError because tkinter might not be available.""" from nltk.util import in_idle from nltk.tag import RegexpTagger from nltk.parse import MaltParser from nltk.sem.logic import Variable from nltk.sem.drt import DrsDrawer, DrtVariableExpression from nltk.sem.glue import DrtGlue class DrtGlueDemo(object): def __init__(self, examples): # Set up the main window. self._top = Tk() self._top.title('DRT Glue Demo') # Set up key bindings. self._init_bindings() # Initialize the fonts.self._error = None self._init_fonts(self._top) self._examples = examples self._readingCache = [None for example in examples] # The user can hide the grammar. self._show_grammar = IntVar(self._top) self._show_grammar.set(1) # Set the data to None self._curExample = -1 self._readings = [] self._drs = None self._drsWidget = None self._error = None self._init_glue() # Create the basic frames. self._init_menubar(self._top) self._init_buttons(self._top) self._init_exampleListbox(self._top) self._init_readingListbox(self._top) self._init_canvas(self._top) # Resize callback self._canvas.bind('', self._configure) ######################################### ## Initialization Helpers ######################################### def _init_glue(self): tagger = RegexpTagger( [('^(David|Mary|John)$', 'NNP'), ('^(walks|sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$', 'VB'), ('^(go|order|vanish|find|approach)$', 'VB'), ('^(a)$', 'ex_quant'), ('^(every)$', 'univ_quant'), ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'), ('^(big|gray|former)$', 'JJ'), ('^(him|himself)$', 'PRP') ]) depparser = MaltParser(tagger=tagger) self._glue = DrtGlue(depparser=depparser, remove_duplicates=False) def _init_fonts(self, root): # See: self._sysfont = Font(font=Button()["font"]) root.option_add("*Font", self._sysfont) # TWhat's our font size (default=same as sysfont) self._size = IntVar(root) self._size.set(self._sysfont.cget('size')) self._boldfont = Font(family='helvetica', weight='bold', size=self._size.get()) self._font = Font(family='helvetica', size=self._size.get()) if self._size.get() < 0: big = self._size.get()-2 else: big = self._size.get()+2 self._bigfont = Font(family='helvetica', weight='bold', size=big) def _init_exampleListbox(self, parent): self._exampleFrame = listframe = Frame(parent) self._exampleFrame.pack(fill='both', side='left', padx=2) self._exampleList_label = Label(self._exampleFrame, font=self._boldfont, text='Examples') self._exampleList_label.pack() self._exampleList = Listbox(self._exampleFrame, selectmode='single', relief='groove', background='white', foreground='#909090', font=self._font, selectforeground='#004040', selectbackground='#c0f0c0') self._exampleList.pack(side='right', fill='both', expand=1) for example in self._examples: self._exampleList.insert('end', (' %s' % example)) self._exampleList.config(height=min(len(self._examples), 25), width=40) # Add a scrollbar if there are more than 25 examples. 
if len(self._examples) > 25: listscroll = Scrollbar(self._exampleFrame, orient='vertical') self._exampleList.config(yscrollcommand = listscroll.set) listscroll.config(command=self._exampleList.yview) listscroll.pack(side='left', fill='y') # If they select a example, apply it. self._exampleList.bind('<>', self._exampleList_select) def _init_readingListbox(self, parent): self._readingFrame = listframe = Frame(parent) self._readingFrame.pack(fill='both', side='left', padx=2) self._readingList_label = Label(self._readingFrame, font=self._boldfont, text='Readings') self._readingList_label.pack() self._readingList = Listbox(self._readingFrame, selectmode='single', relief='groove', background='white', foreground='#909090', font=self._font, selectforeground='#004040', selectbackground='#c0f0c0') self._readingList.pack(side='right', fill='both', expand=1) # Add a scrollbar if there are more than 25 examples. listscroll = Scrollbar(self._readingFrame, orient='vertical') self._readingList.config(yscrollcommand = listscroll.set) listscroll.config(command=self._readingList.yview) listscroll.pack(side='right', fill='y') self._populate_readingListbox() def _populate_readingListbox(self): # Populate the listbox with integers self._readingList.delete(0, 'end') for i in range(len(self._readings)): self._readingList.insert('end', (' %s' % (i+1))) self._readingList.config(height=min(len(self._readings), 25), width=5) # If they select a example, apply it. self._readingList.bind('<>', self._readingList_select) def _init_bindings(self): # Key bindings are a good thing. self._top.bind('', self.destroy) self._top.bind('', self.destroy) self._top.bind('', self.destroy) self._top.bind('n', self.next) self._top.bind('', self.next) self._top.bind('p', self.prev) self._top.bind('', self.prev) def _init_buttons(self, parent): # Set up the frames. 
self._buttonframe = buttonframe = Frame(parent) buttonframe.pack(fill='none', side='bottom', padx=3, pady=2) Button(buttonframe, text='Prev', background='#90c0d0', foreground='black', command=self.prev,).pack(side='left') Button(buttonframe, text='Next', background='#90c0d0', foreground='black', command=self.next,).pack(side='left') def _configure(self, event): self._autostep = 0 (x1, y1, x2, y2) = self._cframe.scrollregion() y2 = event.height - 6 self._canvas['scrollregion'] = '%d %d %d %d' % (x1,y1,x2,y2) self._redraw() def _init_canvas(self, parent): self._cframe = CanvasFrame(parent, background='white', #width=525, height=250, closeenough=10, border=2, relief='sunken') self._cframe.pack(expand=1, fill='both', side='top', pady=2) canvas = self._canvas = self._cframe.canvas() # Initially, there's no tree or text self._tree = None self._textwidgets = [] self._textline = None def _init_menubar(self, parent): menubar = Menu(parent) filemenu = Menu(menubar, tearoff=0) filemenu.add_command(label='Exit', underline=1, command=self.destroy, accelerator='q') menubar.add_cascade(label='File', underline=0, menu=filemenu) actionmenu = Menu(menubar, tearoff=0) actionmenu.add_command(label='Next', underline=0, command=self.next, accelerator='n, Space') actionmenu.add_command(label='Previous', underline=0, command=self.prev, accelerator='p, Backspace') menubar.add_cascade(label='Action', underline=0, menu=actionmenu) optionmenu = Menu(menubar, tearoff=0) optionmenu.add_checkbutton(label='Remove Duplicates', underline=0, variable=self._glue.remove_duplicates, command=self._toggle_remove_duplicates, accelerator='r') menubar.add_cascade(label='Options', underline=0, menu=optionmenu) viewmenu = Menu(menubar, tearoff=0) viewmenu.add_radiobutton(label='Tiny', variable=self._size, underline=0, value=10, command=self.resize) viewmenu.add_radiobutton(label='Small', variable=self._size, underline=0, value=12, command=self.resize) viewmenu.add_radiobutton(label='Medium', variable=self._size, underline=0, value=14, command=self.resize) viewmenu.add_radiobutton(label='Large', variable=self._size, underline=0, value=18, command=self.resize) viewmenu.add_radiobutton(label='Huge', variable=self._size, underline=0, value=24, command=self.resize) menubar.add_cascade(label='View', underline=0, menu=viewmenu) helpmenu = Menu(menubar, tearoff=0) helpmenu.add_command(label='About', underline=0, command=self.about) menubar.add_cascade(label='Help', underline=0, menu=helpmenu) parent.config(menu=menubar) ######################################### ## Main draw procedure ######################################### def _redraw(self): canvas = self._canvas # Delete the old DRS, widgets, etc. 
if self._drsWidget is not None: self._drsWidget.clear() if self._drs: self._drsWidget = DrsWidget( self._canvas, self._drs ) self._drsWidget.draw() if self._error: self._drsWidget = DrsWidget( self._canvas, self._error ) self._drsWidget.draw() ######################################### ## Button Callbacks ######################################### def destroy(self, *e): self._autostep = 0 if self._top is None: return self._top.destroy() self._top = None def prev(self, *e): selection = self._readingList.curselection() readingListSize = self._readingList.size() # there are readings if readingListSize > 0: # if one reading is currently selected if len(selection) == 1: index = int(selection[0]) # if it's on (or before) the first item if index <= 0: self._select_previous_example() else: self._readingList_store_selection(index-1) else: #select its first reading self._readingList_store_selection(readingListSize-1) else: self._select_previous_example() def _select_previous_example(self): #if the current example is not the first example if self._curExample > 0: self._exampleList_store_selection(self._curExample-1) else: #go to the last example self._exampleList_store_selection(len(self._examples)-1) def next(self, *e): selection = self._readingList.curselection() readingListSize = self._readingList.size() # if there are readings if readingListSize > 0: # if one reading is currently selected if len(selection) == 1: index = int(selection[0]) # if it's on (or past) the last item if index >= (readingListSize-1): self._select_next_example() else: self._readingList_store_selection(index+1) else: #select its first reading self._readingList_store_selection(0) else: self._select_next_example() def _select_next_example(self): #if the current example is not the last example if self._curExample < len(self._examples)-1: self._exampleList_store_selection(self._curExample+1) else: #go to the first example self._exampleList_store_selection(0) def about(self, *e): ABOUT = ("NLTK Discourse Representation Theory (DRT) Glue Semantics Demo\n"+ "Written by Daniel H. Garrette") TITLE = 'About: NLTK DRT Glue Demo' try: from tkMessageBox import Message Message(message=ABOUT, title=TITLE).show() except: ShowText(self._top, TITLE, ABOUT) def postscript(self, *e): self._autostep = 0 self._cframe.print_to_file() def mainloop(self, *args, **kwargs): """ Enter the Tkinter mainloop. This function must be called if this demo is created from a non-interactive program (e.g. from a secript); otherwise, the demo will close as soon as the script completes. 
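        A typical invocation, sketched with two of the example sentences used
        elsewhere in this module (not executed here, since it opens a Tkinter
        window and needs a working MaltParser installation for the underlying
        ``DrtGlue`` parser):

            >>> demo = DrtGlueDemo(['John walks', 'every man chases a dog'])  # doctest: +SKIP
            >>> demo.mainloop()  # doctest: +SKIP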
""" if in_idle(): return self._top.mainloop(*args, **kwargs) def resize(self, size=None): if size is not None: self._size.set(size) size = self._size.get() self._font.configure(size=-(abs(size))) self._boldfont.configure(size=-(abs(size))) self._sysfont.configure(size=-(abs(size))) self._bigfont.configure(size=-(abs(size+2))) self._redraw() def _toggle_remove_duplicates(self): self._glue.remove_duplicates = not self._glue.remove_duplicates self._exampleList.selection_clear(0, 'end') self._readings = [] self._populate_readingListbox() self._readingCache = [None for ex in self._examples] self._curExample = -1 self._error = None self._drs = None self._redraw() def _exampleList_select(self, event): selection = self._exampleList.curselection() if len(selection) != 1: return self._exampleList_store_selection(int(selection[0])) def _exampleList_store_selection(self, index): self._curExample = index example = self._examples[index] self._exampleList.selection_clear(0, 'end') if example: cache = self._readingCache[index] if cache: if isinstance(cache, list): self._readings = cache self._error = None else: self._readings = [] self._error = cache else: try: self._readings = self._glue.parse_to_meaning(example) self._error = None self._readingCache[index] = self._readings except Exception as e: self._readings = [] self._error = DrtVariableExpression(Variable('Error: ' + str(e))) self._readingCache[index] = self._error #add a star to the end of the example self._exampleList.delete(index) self._exampleList.insert(index, (' %s *' % example)) self._exampleList.config(height=min(len(self._examples), 25), width=40) self._populate_readingListbox() self._exampleList.selection_set(index) self._drs = None self._redraw() def _readingList_select(self, event): selection = self._readingList.curselection() if len(selection) != 1: return self._readingList_store_selection(int(selection[0])) def _readingList_store_selection(self, index): reading = self._readings[index] self._readingList.selection_clear(0, 'end') if reading: self._readingList.selection_set(index) self._drs = reading.simplify().normalize().resolve_anaphora() self._redraw() class DrsWidget(object): def __init__(self, canvas, drs, **attribs): self._drs = drs self._canvas = canvas canvas.font = Font(font=canvas.itemcget(canvas.create_text(0, 0, text=''), 'font')) canvas._BUFFER = 3 self.bbox = (0, 0, 0, 0) def draw(self): (right, bottom) = DrsDrawer(self._drs, canvas=self._canvas).draw() self.bbox = (0, 0, right+1, bottom+1) def clear(self): self._canvas.create_rectangle(self.bbox, fill="white", width="0" ) def demo(): examples = ['John walks', 'David sees Mary', 'David eats a sandwich', 'every man chases a dog', # 'every man believes a dog yawns', # 'John gives David a sandwich', 'John chases himself', # 'John persuades David to order a pizza', # 'John tries to go', # 'John tries to find a unicorn', # 'John seems to vanish', # 'a unicorn seems to approach', # 'every big cat leaves', # 'every gray cat leaves', # 'every big gray cat leaves', # 'a former senator leaves', # 'John likes a cat', # 'John likes every cat', # 'he walks', # 'John walks and he leaves' ] DrtGlueDemo(examples).mainloop() if __name__ == '__main__': demo() nltk-3.1/nltk/sem/evaluate.py0000644000076500000240000006140112607224144015774 0ustar sbstaff00000000000000# Natural Language Toolkit: Models for first-order languages with lambda # # Copyright (C) 2001-2015 NLTK Project # Author: Ewan Klein , # URL: # For license information, see LICENSE.TXT #TODO: #- fix tracing #- fix iterator-based 
approach to existentials """ This module provides data structures for representing first-order models. """ from __future__ import print_function, unicode_literals from pprint import pformat import inspect import textwrap import re from nltk.decorators import decorator # this used in code that is commented out from nltk.compat import string_types, python_2_unicode_compatible from nltk.sem.logic import (AbstractVariableExpression, AllExpression, Expression, AndExpression, ApplicationExpression, EqualityExpression, ExistsExpression, IffExpression, ImpExpression, IndividualVariableExpression, LambdaExpression, NegatedExpression, OrExpression, Variable, is_indvar) class Error(Exception): pass class Undefined(Error): pass def trace(f, *args, **kw): argspec = inspect.getargspec(f) d = dict(zip(argspec[0], args)) if d.pop('trace', None): print() for item in d.items(): print("%s => %s" % item) return f(*args, **kw) def is_rel(s): """ Check whether a set represents a relation (of any arity). :param s: a set containing tuples of str elements :type s: set :rtype: bool """ # we have the empty relation, i.e. set() if len(s) == 0: return True # all the elements are tuples of the same length elif all(isinstance(el, tuple) for el in s) and len(max(s))==len(min(s)): return True else: raise ValueError("Set %r contains sequences of different lengths" % s) def set2rel(s): """ Convert a set containing individuals (strings or numbers) into a set of unary tuples. Any tuples of strings already in the set are passed through unchanged. For example: - set(['a', 'b']) => set([('a',), ('b',)]) - set([3, 27]) => set([('3',), ('27',)]) :type s: set :rtype: set of tuple of str """ new = set() for elem in s: if isinstance(elem, string_types): new.add((elem,)) elif isinstance(elem, int): new.add((str(elem,))) else: new.add(elem) return new def arity(rel): """ Check the arity of a relation. :type rel: set of tuples :rtype: int of tuple of str """ if len(rel) == 0: return 0 return len(list(rel)[0]) @python_2_unicode_compatible class Valuation(dict): """ A dictionary which represents a model-theoretic Valuation of non-logical constants. Keys are strings representing the constants to be interpreted, and values correspond to individuals (represented as strings) and n-ary relations (represented as sets of tuples of strings). An instance of ``Valuation`` will raise a KeyError exception (i.e., just behave like a standard dictionary) if indexed with an expression that is not in its list of symbols. """ def __init__(self, xs): """ :param xs: a list of (symbol, value) pairs. """ super(Valuation, self).__init__() for (sym, val) in xs: if isinstance(val, string_types) or isinstance(val, bool): self[sym] = val elif isinstance(val, set): self[sym] = set2rel(val) else: msg = textwrap.fill("Error in initializing Valuation. 
" "Unrecognized value for symbol '%s':\n%s" % (sym, val), width=66) raise ValueError(msg) def __getitem__(self, key): if key in self: return dict.__getitem__(self, key) else: raise Undefined("Unknown expression: '%s'" % key) def __str__(self): return pformat(self) @property def domain(self): """Set-theoretic domain of the value-space of a Valuation.""" dom = [] for val in self.values(): if isinstance(val, string_types): dom.append(val) elif not isinstance(val, bool): dom.extend([elem for tuple_ in val for elem in tuple_ if elem is not None]) return set(dom) @property def symbols(self): """The non-logical constants which the Valuation recognizes.""" return sorted(self.keys()) @classmethod def fromstring(cls, s): return read_valuation(s) ########################################## # REs used by the _read_valuation function ########################################## _VAL_SPLIT_RE = re.compile(r'\s*=+>\s*') _ELEMENT_SPLIT_RE = re.compile(r'\s*,\s*') _TUPLES_RE = re.compile(r"""\s* (\([^)]+\)) # tuple-expression \s*""", re.VERBOSE) def _read_valuation_line(s): """ Read a line in a valuation file. Lines are expected to be of the form:: noosa => n girl => {g1, g2} chase => {(b1, g1), (b2, g1), (g1, d1), (g2, d2)} :param s: input line :type s: str :return: a pair (symbol, value) :rtype: tuple """ pieces = _VAL_SPLIT_RE.split(s) symbol = pieces[0] value = pieces[1] # check whether the value is meant to be a set if value.startswith('{'): value = value[1:-1] tuple_strings = _TUPLES_RE.findall(value) # are the set elements tuples? if tuple_strings: set_elements = [] for ts in tuple_strings: ts = ts[1:-1] element = tuple(_ELEMENT_SPLIT_RE.split(ts)) set_elements.append(element) else: set_elements = _ELEMENT_SPLIT_RE.split(value) value = set(set_elements) return symbol, value def read_valuation(s, encoding=None): """ Convert a valuation string into a valuation. :param s: a valuation string :type s: str :param encoding: the encoding of the input string, if it is binary :type encoding: str :return: a ``nltk.sem`` valuation :rtype: Valuation """ if encoding is not None: s = s.decode(encoding) statements = [] for linenum, line in enumerate(s.splitlines()): line = line.strip() if line.startswith('#') or line=='': continue try: statements.append(_read_valuation_line(line)) except ValueError: raise ValueError('Unable to parse line %s: %s' % (linenum, line)) return Valuation(statements) @python_2_unicode_compatible class Assignment(dict): """ A dictionary which represents an assignment of values to variables. An assigment can only assign values from its domain. If an unknown expression *a* is passed to a model *M*\ 's interpretation function *i*, *i* will first check whether *M*\ 's valuation assigns an interpretation to *a* as a constant, and if this fails, *i* will delegate the interpretation of *a* to *g*. *g* only assigns values to individual variables (i.e., members of the class ``IndividualVariableExpression`` in the ``logic`` module. If a variable is not assigned a value by *g*, it will raise an ``Undefined`` exception. A variable *Assignment* is a mapping from individual variables to entities in the domain. Individual variables are usually indicated with the letters ``'x'``, ``'y'``, ``'w'`` and ``'z'``, optionally followed by an integer (e.g., ``'x0'``, ``'y332'``). Assignments are created using the ``Assignment`` constructor, which also takes the domain as a parameter. 
>>> from nltk.sem.evaluate import Assignment >>> dom = set(['u1', 'u2', 'u3', 'u4']) >>> g3 = Assignment(dom, [('x', 'u1'), ('y', 'u2')]) >>> g3 == {'x': 'u1', 'y': 'u2'} True There is also a ``print`` format for assignments which uses a notation closer to that in logic textbooks: >>> print(g3) g[u1/x][u2/y] It is also possible to update an assignment using the ``add`` method: >>> dom = set(['u1', 'u2', 'u3', 'u4']) >>> g4 = Assignment(dom) >>> g4.add('x', 'u1') {'x': 'u1'} With no arguments, ``purge()`` is equivalent to ``clear()`` on a dictionary: >>> g4.purge() >>> g4 {} :param domain: the domain of discourse :type domain: set :param assign: a list of (varname, value) associations :type assign: list """ def __init__(self, domain, assign=None): super(Assignment, self).__init__() self.domain = domain if assign: for (var, val) in assign: assert val in self.domain,\ "'%s' is not in the domain: %s" % (val, self.domain) assert is_indvar(var),\ "Wrong format for an Individual Variable: '%s'" % var self[var] = val self.variant = None self._addvariant() def __getitem__(self, key): if key in self: return dict.__getitem__(self, key) else: raise Undefined("Not recognized as a variable: '%s'" % key) def copy(self): new = Assignment(self.domain) new.update(self) return new def purge(self, var=None): """ Remove one or all keys (i.e. logic variables) from an assignment, and update ``self.variant``. :param var: a Variable acting as a key for the assignment. """ if var: del self[var] else: self.clear() self._addvariant() return None def __str__(self): """ Pretty printing for assignments. {'x', 'u'} appears as 'g[u/x]' """ gstring = "g" # Deterministic output for unit testing. variant = sorted(self.variant) for (val, var) in variant: gstring += "[%s/%s]" % (val, var) return gstring def _addvariant(self): """ Create a more pretty-printable version of the assignment. """ list_ = [] for item in self.items(): pair = (item[1], item[0]) list_.append(pair) self.variant = list_ return None def add(self, var, val): """ Add a new variable-value pair to the assignment, and update ``self.variant``. """ assert val in self.domain,\ "%s is not in the domain %s" % (val, self.domain) assert is_indvar(var),\ "Wrong format for an Individual Variable: '%s'" % var self[var] = val self._addvariant() return self @python_2_unicode_compatible class Model(object): """ A first order model is a domain *D* of discourse and a valuation *V*. A domain *D* is a set, and a valuation *V* is a map that associates expressions with values in the model. The domain of *V* should be a subset of *D*. Construct a new ``Model``. :type domain: set :param domain: A set of entities representing the domain of discourse of the model. :type valuation: Valuation :param valuation: the valuation of the model. :param prop: If this is set, then we are building a propositional\ model and don't require the domain of *V* to be subset of *D*. """ def __init__(self, domain, valuation): assert isinstance(domain, set) self.domain = domain self.valuation = valuation if not domain.issuperset(valuation.domain): raise Error("The valuation domain, %s, must be a subset of the model's domain, %s"\ % (valuation.domain, domain)) def __repr__(self): return "(%r, %r)" % (self.domain, self.valuation) def __str__(self): return "Domain = %s,\nValuation = \n%s" % (self.domain, self.valuation) def evaluate(self, expr, g, trace=None): """ Read input expressions, and provide a handler for ``satisfy`` that blocks further propagation of the ``Undefined`` error. 
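        A minimal sketch of the intended usage, with an illustrative toy
        valuation (the entities and predicates here are made up for this
        example, not taken from the demos below):

            >>> v = Valuation([('adam', 'b1'), ('boy', set(['b1', 'b2']))])
            >>> m = Model(v.domain, v)
            >>> g = Assignment(v.domain)
            >>> m.evaluate('boy(adam)', g)
            True
            >>> m.evaluate('girl(adam)', g)
            'Undefined'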
:param expr: An ``Expression`` of ``logic``. :type g: Assignment :param g: an assignment to individual variables. :rtype: bool or 'Undefined' """ try: parsed = Expression.fromstring(expr) value = self.satisfy(parsed, g, trace=trace) if trace: print() print("'%s' evaluates to %s under M, %s" % (expr, value, g)) return value except Undefined: if trace: print() print("'%s' is undefined under M, %s" % (expr, g)) return 'Undefined' def satisfy(self, parsed, g, trace=None): """ Recursive interpretation function for a formula of first-order logic. Raises an ``Undefined`` error when ``parsed`` is an atomic string but is not a symbol or an individual variable. :return: Returns a truth value or ``Undefined`` if ``parsed`` is\ complex, and calls the interpretation function ``i`` if ``parsed``\ is atomic. :param parsed: An expression of ``logic``. :type g: Assignment :param g: an assignment to individual variables. """ if isinstance(parsed, ApplicationExpression): function, arguments = parsed.uncurry() if isinstance(function, AbstractVariableExpression): #It's a predicate expression ("P(x,y)"), so used uncurried arguments funval = self.satisfy(function, g) argvals = tuple(self.satisfy(arg, g) for arg in arguments) return argvals in funval else: #It must be a lambda expression, so use curried form funval = self.satisfy(parsed.function, g) argval = self.satisfy(parsed.argument, g) return funval[argval] elif isinstance(parsed, NegatedExpression): return not self.satisfy(parsed.term, g) elif isinstance(parsed, AndExpression): return self.satisfy(parsed.first, g) and \ self.satisfy(parsed.second, g) elif isinstance(parsed, OrExpression): return self.satisfy(parsed.first, g) or \ self.satisfy(parsed.second, g) elif isinstance(parsed, ImpExpression): return (not self.satisfy(parsed.first, g)) or \ self.satisfy(parsed.second, g) elif isinstance(parsed, IffExpression): return self.satisfy(parsed.first, g) == \ self.satisfy(parsed.second, g) elif isinstance(parsed, EqualityExpression): return self.satisfy(parsed.first, g) == \ self.satisfy(parsed.second, g) elif isinstance(parsed, AllExpression): new_g = g.copy() for u in self.domain: new_g.add(parsed.variable.name, u) if not self.satisfy(parsed.term, new_g): return False return True elif isinstance(parsed, ExistsExpression): new_g = g.copy() for u in self.domain: new_g.add(parsed.variable.name, u) if self.satisfy(parsed.term, new_g): return True return False elif isinstance(parsed, LambdaExpression): cf = {} var = parsed.variable.name for u in self.domain: val = self.satisfy(parsed.term, g.add(var, u)) # NB the dict would be a lot smaller if we do this: # if val: cf[u] = val # But then need to deal with cases where f(a) should yield # a function rather than just False. cf[u] = val return cf else: return self.i(parsed, g, trace) #@decorator(trace_eval) def i(self, parsed, g, trace=False): """ An interpretation function. Assuming that ``parsed`` is atomic: - if ``parsed`` is a non-logical constant, calls the valuation *V* - else if ``parsed`` is an individual variable, calls assignment *g* - else returns ``Undefined``. :param parsed: an ``Expression`` of ``logic``. :type g: Assignment :param g: an assignment to individual variables. :return: a semantic value """ # If parsed is a propositional letter 'p', 'q', etc, it could be in valuation.symbols # and also be an IndividualVariableExpression. We want to catch this first case. 
# So there is a procedural consequence to the ordering of clauses here: if parsed.variable.name in self.valuation.symbols: return self.valuation[parsed.variable.name] elif isinstance(parsed, IndividualVariableExpression): return g[parsed.variable.name] else: raise Undefined("Can't find a value for %s" % parsed) def satisfiers(self, parsed, varex, g, trace=None, nesting=0): """ Generate the entities from the model's domain that satisfy an open formula. :param parsed: an open formula :type parsed: Expression :param varex: the relevant free individual variable in ``parsed``. :type varex: VariableExpression or str :param g: a variable assignment :type g: Assignment :return: a set of the entities that satisfy ``parsed``. """ spacer = ' ' indent = spacer + (spacer * nesting) candidates = [] if isinstance(varex, string_types): var = Variable(varex) else: var = varex if var in parsed.free(): if trace: print() print((spacer * nesting) + "Open formula is '%s' with assignment %s" % (parsed, g)) for u in self.domain: new_g = g.copy() new_g.add(var.name, u) if trace and trace > 1: lowtrace = trace-1 else: lowtrace = 0 value = self.satisfy(parsed, new_g, lowtrace) if trace: print(indent + "(trying assignment %s)" % new_g) # parsed == False under g[u/var]? if value == False: if trace: print(indent + "value of '%s' under %s is False" % (parsed, new_g)) # so g[u/var] is a satisfying assignment else: candidates.append(u) if trace: print(indent + "value of '%s' under %s is %s" % (parsed, new_g, value)) result = set(c for c in candidates) # var isn't free in parsed else: raise Undefined("%s is not free in %s" % (var.name, parsed)) return result #////////////////////////////////////////////////////////////////////// # Demo.. #////////////////////////////////////////////////////////////////////// # number of spacer chars mult = 30 # Demo 1: Propositional Logic ################# def propdemo(trace=None): """Example of a propositional model.""" global val1, dom1, m1, g1 val1 = Valuation([('P', True), ('Q', True), ('R', False)]) dom1 = set([]) m1 = Model(dom1, val1) g1 = Assignment(dom1) print() print('*' * mult) print("Propositional Formulas Demo") print('*' * mult) print('(Propositional constants treated as nullary predicates)') print() print("Model m1:\n", m1) print('*' * mult) sentences = [ '(P & Q)', '(P & R)', '- P', '- R', '- - P', '- (P & R)', '(P | R)', '(R | P)', '(R | R)', '(- P | R)', '(P | - P)', '(P -> Q)', '(P -> R)', '(R -> P)', '(P <-> P)', '(R <-> R)', '(P <-> R)', ] for sent in sentences: if trace: print() m1.evaluate(sent, g1, trace) else: print("The value of '%s' is: %s" % (sent, m1.evaluate(sent, g1))) # Demo 2: FOL Model ############# def folmodel(quiet=False, trace=None): """Example of a first-order model.""" global val2, v2, dom2, m2, g2 v2 = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),\ ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1'])), ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] val2 = Valuation(v2) dom2 = val2.domain m2 = Model(dom2, val2) g2 = Assignment(dom2, [('x', 'b1'), ('y', 'g2')]) if not quiet: print() print('*' * mult) print("Models Demo") print("*" * mult) print("Model m2:\n", "-" * 14,"\n", m2) print("Variable assignment = ", g2) exprs = ['adam', 'boy', 'love', 'walks', 'x', 'y', 'z'] parsed_exprs = [Expression.fromstring(e) for e in exprs] print() for parsed in parsed_exprs: try: print("The interpretation of '%s' in m2 is %s" % (parsed, m2.i(parsed, g2))) except Undefined: print("The interpretation of '%s' in m2 
is Undefined" % parsed) applications = [('boy', ('adam')), ('walks', ('adam',)), ('love', ('adam', 'y')), ('love', ('y', 'adam'))] for (fun, args) in applications: try: funval = m2.i(Expression.fromstring(fun), g2) argsval = tuple(m2.i(Expression.fromstring(arg), g2) for arg in args) print("%s(%s) evaluates to %s" % (fun, args, argsval in funval)) except Undefined: print("%s(%s) evaluates to Undefined" % (fun, args)) # Demo 3: FOL ######### def foldemo(trace=None): """ Interpretation of closed expressions in a first-order model. """ folmodel(quiet=True) print() print('*' * mult) print("FOL Formulas Demo") print('*' * mult) formulas = [ 'love (adam, betty)', '(adam = mia)', '\\x. (boy(x) | girl(x))', '\\x. boy(x)(adam)', '\\x y. love(x, y)', '\\x y. love(x, y)(adam)(betty)', '\\x y. love(x, y)(adam, betty)', '\\x y. (boy(x) & love(x, y))', '\\x. exists y. (boy(x) & love(x, y))', 'exists z1. boy(z1)', 'exists x. (boy(x) & -(x = adam))', 'exists x. (boy(x) & all y. love(y, x))', 'all x. (boy(x) | girl(x))', 'all x. (girl(x) -> exists y. boy(y) & love(x, y))', #Every girl loves exists boy. 'exists x. (boy(x) & all y. (girl(y) -> love(y, x)))', #There is exists boy that every girl loves. 'exists x. (boy(x) & all y. (girl(y) -> love(x, y)))', #exists boy loves every girl. 'all x. (dog(x) -> - girl(x))', 'exists x. exists y. (love(x, y) & love(x, y))' ] for fmla in formulas: g2.purge() if trace: m2.evaluate(fmla, g2, trace) else: print("The value of '%s' is: %s" % (fmla, m2.evaluate(fmla, g2))) # Demo 3: Satisfaction ############# def satdemo(trace=None): """Satisfiers of an open formula in a first order model.""" print() print('*' * mult) print("Satisfiers Demo") print('*' * mult) folmodel(quiet=True) formulas = [ 'boy(x)', '(x = x)', '(boy(x) | girl(x))', '(boy(x) & girl(x))', 'love(adam, x)', 'love(x, adam)', '-(x = adam)', 'exists z22. love(x, z22)', 'exists y. love(y, x)', 'all y. (girl(y) -> love(x, y))', 'all y. (girl(y) -> love(y, x))', 'all y. (girl(y) -> (boy(x) & love(y, x)))', '(boy(x) & all y. (girl(y) -> love(x, y)))', '(boy(x) & all y. (girl(y) -> love(y, x)))', '(boy(x) & exists y. (girl(y) & love(y, x)))', '(girl(x) -> dog(x))', 'all y. (dog(y) -> (x = y))', 'exists y. love(y, x)', 'exists y. (love(adam, y) & love(y, x))' ] if trace: print(m2) for fmla in formulas: print(fmla) Expression.fromstring(fmla) parsed = [Expression.fromstring(fmla) for fmla in formulas] for p in parsed: g2.purge() print("The satisfiers of '%s' are: %s" % (p, m2.satisfiers(p, 'x', g2, trace))) def demo(num=0, trace=None): """ Run exists demos. 
- num = 1: propositional logic demo - num = 2: first order model demo (only if trace is set) - num = 3: first order sentences demo - num = 4: satisfaction of open formulas demo - any other value: run all the demos :param trace: trace = 1, or trace = 2 for more verbose tracing """ demos = { 1: propdemo, 2: folmodel, 3: foldemo, 4: satdemo} try: demos[num](trace=trace) except KeyError: for num in demos: demos[num](trace=trace) if __name__ == "__main__": demo(2, trace=0) nltk-3.1/nltk/sem/glue.py0000644000076500000240000006636412607224144015137 0ustar sbstaff00000000000000# Natural Language Toolkit: Glue Semantics # # Author: Dan Garrette # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT from __future__ import print_function, division, unicode_literals import os import nltk from nltk.internals import Counter from nltk.compat import string_types from nltk.tag import UnigramTagger, BigramTagger, TrigramTagger, RegexpTagger from nltk.sem.logic import (Expression, Variable, VariableExpression, LambdaExpression, AbstractVariableExpression) from nltk.compat import python_2_unicode_compatible from nltk.sem import drt from nltk.sem import linearlogic SPEC_SEMTYPES = {'a' : 'ex_quant', 'an' : 'ex_quant', 'every' : 'univ_quant', 'the' : 'def_art', 'no' : 'no_quant', 'default' : 'ex_quant'} OPTIONAL_RELATIONSHIPS = ['nmod', 'vmod', 'punct'] @python_2_unicode_compatible class GlueFormula(object): def __init__(self, meaning, glue, indices=None): if not indices: indices = set() if isinstance(meaning, string_types): self.meaning = Expression.fromstring(meaning) elif isinstance(meaning, Expression): self.meaning = meaning else: raise RuntimeError('Meaning term neither string or expression: %s, %s' % (meaning, meaning.__class__)) if isinstance(glue, string_types): self.glue = linearlogic.LinearLogicParser().parse(glue) elif isinstance(glue, linearlogic.Expression): self.glue = glue else: raise RuntimeError('Glue term neither string or expression: %s, %s' % (glue, glue.__class__)) self.indices = indices def applyto(self, arg): """ self = (\\x.(walk x), (subj -o f)) arg = (john , subj) returns ((walk john), f) """ if self.indices & arg.indices: # if the sets are NOT disjoint raise linearlogic.LinearLogicApplicationException("'%s' applied to '%s'. Indices are not disjoint." 
% (self, arg)) else: # if the sets ARE disjoint return_indices = (self.indices | arg.indices) try: return_glue = linearlogic.ApplicationExpression(self.glue, arg.glue, arg.indices) except linearlogic.LinearLogicApplicationException: raise linearlogic.LinearLogicApplicationException("'%s' applied to '%s'" % (self.simplify(), arg.simplify())) arg_meaning_abstracted = arg.meaning if return_indices: for dep in self.glue.simplify().antecedent.dependencies[::-1]: # if self.glue is (A -o B), dep is in A.dependencies arg_meaning_abstracted = self.make_LambdaExpression(Variable('v%s' % dep), arg_meaning_abstracted) return_meaning = self.meaning.applyto(arg_meaning_abstracted) return self.__class__(return_meaning, return_glue, return_indices) def make_VariableExpression(self, name): return VariableExpression(name) def make_LambdaExpression(self, variable, term): return LambdaExpression(variable, term) def lambda_abstract(self, other): assert isinstance(other, GlueFormula) assert isinstance(other.meaning, AbstractVariableExpression) return self.__class__(self.make_LambdaExpression(other.meaning.variable, self.meaning), linearlogic.ImpExpression(other.glue, self.glue)) def compile(self, counter=None): """From Iddo Lev's PhD Dissertation p108-109""" if not counter: counter = Counter() (compiled_glue, new_forms) = self.glue.simplify().compile_pos(counter, self.__class__) return new_forms + [self.__class__(self.meaning, compiled_glue, set([counter.get()]))] def simplify(self): return self.__class__(self.meaning.simplify(), self.glue.simplify(), self.indices) def __eq__(self, other): return self.__class__ == other.__class__ and self.meaning == other.meaning and self.glue == other.glue def __ne__(self, other): return not self == other def __str__(self): assert isinstance(self.indices, set) accum = '%s : %s' % (self.meaning, self.glue) if self.indices: accum += ' : {' + ', '.join(str(index) for index in self.indices) + '}' return accum def __repr__(self): return "%s" % self @python_2_unicode_compatible class GlueDict(dict): def __init__(self, filename, encoding=None): self.filename = filename self.file_encoding = encoding self.read_file() def read_file(self, empty_first=True): if empty_first: self.clear() try: contents = nltk.data.load(self.filename, format='text', encoding=self.file_encoding) # TODO: the above can't handle zip files, but this should anyway be fixed in nltk.data.load() except LookupError as e: try: contents = nltk.data.load('file:' + self.filename, format='text', encoding=self.file_encoding) except LookupError: raise e lines = contents.splitlines() for line in lines: # example: 'n : (\\x.( x), (v-or))' # lambdacalc -^ linear logic -^ line = line.strip() # remove trailing newline if not len(line): continue # skip empty lines if line[0] == '#': continue # skip commented out lines parts = line.split(' : ', 2) # ['verb', '(\\x.( x), ( subj -o f ))', '[subj]'] glue_formulas = [] paren_count = 0 tuple_start = 0 tuple_comma = 0 relationships = None if len(parts) > 1: for (i, c) in enumerate(parts[1]): if c == '(': if paren_count == 0: # if it's the first '(' of a tuple tuple_start = i+1 # then save the index paren_count += 1 elif c == ')': paren_count -= 1 if paren_count == 0: # if it's the last ')' of a tuple meaning_term = parts[1][tuple_start:tuple_comma] # '\\x.( x)' glue_term = parts[1][tuple_comma+1:i] # '(v-r)' glue_formulas.append([meaning_term, glue_term]) # add the GlueFormula to the list elif c == ',': if paren_count == 1: # if it's a comma separating the parts of the tuple tuple_comma 
= i # then save the index elif c == '#': # skip comments at the ends of lines if paren_count != 0: # if the line hasn't parsed correctly so far raise RuntimeError('Formula syntax is incorrect for entry ' + line) break # break to the next line if len(parts) > 2: #if there is a relationship entry at the end rel_start = parts[2].index('[')+1 rel_end = parts[2].index(']') if rel_start == rel_end: relationships = frozenset() else: relationships = frozenset(r.strip() for r in parts[2][rel_start:rel_end].split(',')) try: start_inheritance = parts[0].index('(') end_inheritance = parts[0].index(')') sem = parts[0][:start_inheritance].strip() supertype = parts[0][start_inheritance+1:end_inheritance] except: sem = parts[0].strip() supertype = None if sem not in self: self[sem] = {} if relationships is None: #if not specified for a specific relationship set #add all relationship entries for parents if supertype: for rels in self[supertype]: if rels not in self[sem]: self[sem][rels] = [] glue = self[supertype][rels] self[sem][rels].extend(glue) self[sem][rels].extend(glue_formulas) # add the glue formulas to every rel entry else: if None not in self[sem]: self[sem][None] = [] self[sem][None].extend(glue_formulas) # add the glue formulas to every rel entry else: if relationships not in self[sem]: self[sem][relationships] = [] if supertype: self[sem][relationships].extend(self[supertype][relationships]) self[sem][relationships].extend(glue_formulas) # add the glue entry to the dictionary def __str__(self): accum = '' for pos in self: str_pos = "%s" % pos for relset in self[pos]: i = 1 for gf in self[pos][relset]: if i == 1: accum += str_pos + ': ' else: accum += ' '*(len(str_pos)+2) accum += "%s" % gf if relset and i == len(self[pos][relset]): accum += ' : %s' % relset accum += '\n' i += 1 return accum def to_glueformula_list(self, depgraph, node=None, counter=None, verbose=False): if node is None: # TODO: should it be depgraph.root? Is this code tested? 
top = depgraph.nodes[0] depList = sum(list(top['deps'].values()), []) root = depgraph.nodes[depList[0]] return self.to_glueformula_list(depgraph, root, Counter(), verbose) glueformulas = self.lookup(node, depgraph, counter) for dep_idx in sum(list(node['deps'].values()), []): dep = depgraph.nodes[dep_idx] glueformulas.extend(self.to_glueformula_list(depgraph, dep, counter, verbose)) return glueformulas def lookup(self, node, depgraph, counter): semtype_names = self.get_semtypes(node) semtype = None for name in semtype_names: if name in self: semtype = self[name] break if semtype is None: # raise KeyError, "There is no GlueDict entry for sem type '%s' (for '%s')" % (sem, word) return [] self.add_missing_dependencies(node, depgraph) lookup = self._lookup_semtype_option(semtype, node, depgraph) if not len(lookup): raise KeyError( "There is no GlueDict entry for sem type of '%s' " "with tag '%s', and rel '%s'" % (node['word'], node['tag'], node['rel']) ) return self.get_glueformulas_from_semtype_entry(lookup, node['word'], node, depgraph, counter) def add_missing_dependencies(self, node, depgraph): rel = node['rel'].lower() if rel == 'main': headnode = depgraph.nodes[node['head']] subj = self.lookup_unique('subj', headnode, depgraph) relation = subj['rel'] node['deps'].setdefault(relation,[]) node['deps'][relation].append(subj['address']) #node['deps'].append(subj['address']) def _lookup_semtype_option(self, semtype, node, depgraph): relationships = frozenset( depgraph.nodes[dep]['rel'].lower() for dep in sum(list(node['deps'].values()), []) if depgraph.nodes[dep]['rel'].lower() not in OPTIONAL_RELATIONSHIPS ) try: lookup = semtype[relationships] except KeyError: # An exact match is not found, so find the best match where # 'best' is defined as the glue entry whose relationship set has the # most relations of any possible relationship set that is a subset # of the actual depgraph best_match = frozenset() for relset_option in set(semtype)-set([None]): if len(relset_option) > len(best_match) and \ relset_option < relationships: best_match = relset_option if not best_match: if None in semtype: best_match = None else: return None lookup = semtype[best_match] return lookup def get_semtypes(self, node): """ Based on the node, return a list of plausible semtypes in order of plausibility. 
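        For instance (the node dictionaries below are hand-built purely for
        illustration; in practice they come from a ``DependencyGraph``, and
        loading the semtype file requires the NLTK data package):

            >>> gd = GlueDict('grammars/sample_grammars/glue.semtype')  # doctest: +SKIP
            >>> gd.get_semtypes({'word': 'every', 'tag': 'DT', 'rel': 'spec'})  # doctest: +SKIP
            ['univ_quant']
            >>> gd.get_semtypes({'word': 'dog', 'tag': 'NN', 'rel': 'obj'})  # doctest: +SKIP
            ['NN']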
""" rel = node['rel'].lower() word = node['word'].lower() if rel == 'spec': if word in SPEC_SEMTYPES: return [SPEC_SEMTYPES[word]] else: return [SPEC_SEMTYPES['default']] elif rel in ['nmod', 'vmod']: return [node['tag'], rel] else: return [node['tag']] def get_glueformulas_from_semtype_entry(self, lookup, word, node, depgraph, counter): glueformulas = [] glueFormulaFactory = self.get_GlueFormula_factory() for meaning, glue in lookup: gf = glueFormulaFactory(self.get_meaning_formula(meaning, word), glue) if not len(glueformulas): gf.word = word else: gf.word = '%s%s' % (word, len(glueformulas)+1) gf.glue = self.initialize_labels(gf.glue, node, depgraph, counter.get()) glueformulas.append(gf) return glueformulas def get_meaning_formula(self, generic, word): """ :param generic: A meaning formula string containing the parameter "" :param word: The actual word to be replace "" """ word = word.replace('.', '') return generic.replace('', word) def initialize_labels(self, expr, node, depgraph, unique_index): if isinstance(expr, linearlogic.AtomicExpression): name = self.find_label_name(expr.name.lower(), node, depgraph, unique_index) if name[0].isupper(): return linearlogic.VariableExpression(name) else: return linearlogic.ConstantExpression(name) else: return linearlogic.ImpExpression( self.initialize_labels(expr.antecedent, node, depgraph, unique_index), self.initialize_labels(expr.consequent, node, depgraph, unique_index) ) def find_label_name(self, name, node, depgraph, unique_index): try: dot = name.index('.') before_dot = name[:dot] after_dot = name[dot+1:] if before_dot == 'super': return self.find_label_name(after_dot, depgraph.nodes[node['head']], depgraph, unique_index) else: return self.find_label_name(after_dot, self.lookup_unique(before_dot, node, depgraph), depgraph, unique_index) except ValueError: lbl = self.get_label(node) if name == 'f': return lbl elif name == 'v': return '%sv' % lbl elif name == 'r': return '%sr' % lbl elif name == 'super': return self.get_label(depgraph.nodes[node['head']]) elif name == 'var': return '%s%s' % (lbl.upper(), unique_index) elif name == 'a': return self.get_label(self.lookup_unique('conja', node, depgraph)) elif name == 'b': return self.get_label(self.lookup_unique('conjb', node, depgraph)) else: return self.get_label(self.lookup_unique(name, node, depgraph)) def get_label(self, node): """ Pick an alphabetic character as identifier for an entity in the model. :param value: where to index into the list of characters :type value: int """ value = node['address'] letter = ['f','g','h','i','j','k','l','m','n','o','p','q','r','s', 't','u','v','w','x','y','z','a','b','c','d','e'][value-1] num = int(value) // 26 if num > 0: return letter + str(num) else: return letter def lookup_unique(self, rel, node, depgraph): """ Lookup 'key'. There should be exactly one item in the associated relation. 
""" deps = [ depgraph.nodes[dep] for dep in sum(list(node['deps'].values()), []) if depgraph.nodes[dep]['rel'].lower() == rel.lower() ] if len(deps) == 0: raise KeyError("'%s' doesn't contain a feature '%s'" % (node['word'], rel)) elif len(deps) > 1: raise KeyError("'%s' should only have one feature '%s'" % (node['word'], rel)) else: return deps[0] def get_GlueFormula_factory(self): return GlueFormula class Glue(object): def __init__(self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False): self.verbose = verbose self.remove_duplicates = remove_duplicates self.depparser = depparser from nltk import Prover9 self.prover = Prover9() if semtype_file: self.semtype_file = semtype_file else: self.semtype_file = os.path.join('grammars', 'sample_grammars','glue.semtype') def train_depparser(self, depgraphs=None): if depgraphs: self.depparser.train(depgraphs) else: self.depparser.train_from_file(nltk.data.find( os.path.join('grammars', 'sample_grammars', 'glue_train.conll'))) def parse_to_meaning(self, sentence): readings = [] for agenda in self.parse_to_compiled(sentence): readings.extend(self.get_readings(agenda)) return readings def get_readings(self, agenda): readings = [] agenda_length = len(agenda) atomics = dict() nonatomics = dict() while agenda: # is not empty cur = agenda.pop() glue_simp = cur.glue.simplify() if isinstance(glue_simp, linearlogic.ImpExpression): # if cur.glue is non-atomic for key in atomics: try: if isinstance(cur.glue, linearlogic.ApplicationExpression): bindings = cur.glue.bindings else: bindings = linearlogic.BindingDict() glue_simp.antecedent.unify(key, bindings) for atomic in atomics[key]: if not (cur.indices & atomic.indices): # if the sets of indices are disjoint try: agenda.append(cur.applyto(atomic)) except linearlogic.LinearLogicApplicationException: pass except linearlogic.UnificationException: pass try: nonatomics[glue_simp.antecedent].append(cur) except KeyError: nonatomics[glue_simp.antecedent] = [cur] else: # else cur.glue is atomic for key in nonatomics: for nonatomic in nonatomics[key]: try: if isinstance(nonatomic.glue, linearlogic.ApplicationExpression): bindings = nonatomic.glue.bindings else: bindings = linearlogic.BindingDict() glue_simp.unify(key, bindings) if not (cur.indices & nonatomic.indices): # if the sets of indices are disjoint try: agenda.append(nonatomic.applyto(cur)) except linearlogic.LinearLogicApplicationException: pass except linearlogic.UnificationException: pass try: atomics[glue_simp].append(cur) except KeyError: atomics[glue_simp] = [cur] for entry in atomics: for gf in atomics[entry]: if len(gf.indices) == agenda_length: self._add_to_reading_list(gf, readings) for entry in nonatomics: for gf in nonatomics[entry]: if len(gf.indices) == agenda_length: self._add_to_reading_list(gf, readings) return readings def _add_to_reading_list(self, glueformula, reading_list): add_reading = True if self.remove_duplicates: for reading in reading_list: try: if reading.equiv(glueformula.meaning, self.prover): add_reading = False break except Exception as e: #if there is an exception, the syntax of the formula #may not be understandable by the prover, so don't #throw out the reading. 
print('Error when checking logical equality of statements', e) pass if add_reading: reading_list.append(glueformula.meaning) def parse_to_compiled(self, sentence): gfls = [self.depgraph_to_glue(dg) for dg in self.dep_parse(sentence)] return [self.gfl_to_compiled(gfl) for gfl in gfls] def dep_parse(self, sentence): """ Return a dependency graph for the sentence. :param sentence: the sentence to be parsed :type sentence: list(str) :rtype: DependencyGraph """ #Lazy-initialize the depparser if self.depparser is None: from nltk.parse import MaltParser self.depparser = MaltParser(tagger=self.get_pos_tagger()) if not self.depparser._trained: self.train_depparser() return self.depparser.parse(sentence, verbose=self.verbose) def depgraph_to_glue(self, depgraph): return self.get_glue_dict().to_glueformula_list(depgraph) def get_glue_dict(self): return GlueDict(self.semtype_file) def gfl_to_compiled(self, gfl): index_counter = Counter() return_list = [] for gf in gfl: return_list.extend(gf.compile(index_counter)) if self.verbose: print('Compiled Glue Premises:') for cgf in return_list: print(cgf) return return_list def get_pos_tagger(self): from nltk.corpus import brown regexp_tagger = RegexpTagger( [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'AT'), # articles (r'.*able$', 'JJ'), # adjectives (r'.*ness$', 'NN'), # nouns formed from adjectives (r'.*ly$', 'RB'), # adverbs (r'.*s$', 'NNS'), # plural nouns (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # past tense verbs (r'.*', 'NN') # nouns (default) ]) brown_train = brown.tagged_sents(categories='news') unigram_tagger = UnigramTagger(brown_train, backoff=regexp_tagger) bigram_tagger = BigramTagger(brown_train, backoff=unigram_tagger) trigram_tagger = TrigramTagger(brown_train, backoff=bigram_tagger) #Override particular words main_tagger = RegexpTagger( [(r'(A|a|An|an)$', 'ex_quant'), (r'(Every|every|All|all)$', 'univ_quant') ], backoff=trigram_tagger) return main_tagger class DrtGlueFormula(GlueFormula): def __init__(self, meaning, glue, indices=None): if not indices: indices = set() if isinstance(meaning, string_types): self.meaning = drt.DrtExpression.fromstring(meaning) elif isinstance(meaning, drt.DrtExpression): self.meaning = meaning else: raise RuntimeError('Meaning term neither string or expression: %s, %s' % (meaning, meaning.__class__)) if isinstance(glue, string_types): self.glue = linearlogic.LinearLogicParser().parse(glue) elif isinstance(glue, linearlogic.Expression): self.glue = glue else: raise RuntimeError('Glue term neither string or expression: %s, %s' % (glue, glue.__class__)) self.indices = indices def make_VariableExpression(self, name): return drt.DrtVariableExpression(name) def make_LambdaExpression(self, variable, term): return drt.DrtLambdaExpression(variable, term) class DrtGlueDict(GlueDict): def get_GlueFormula_factory(self): return DrtGlueFormula class DrtGlue(Glue): def __init__(self, semtype_file=None, remove_duplicates=False, depparser=None, verbose=False): if not semtype_file: semtype_file = os.path.join('grammars', 'sample_grammars','drt_glue.semtype') Glue.__init__(self, semtype_file, remove_duplicates, depparser, verbose) def get_glue_dict(self): return DrtGlueDict(self.semtype_file) def demo(show_example=-1): from nltk.parse import MaltParser examples = ['David sees Mary', 'David eats a sandwich', 'every man chases a dog', 'every man believes a dog sleeps', 'John gives David a sandwich', 'John chases himself'] # 'John persuades David to order a pizza', # 'John tries to go', # 
'John tries to find a unicorn', # 'John seems to vanish', # 'a unicorn seems to approach', # 'every big cat leaves', # 'every gray cat leaves', # 'every big gray cat leaves', # 'a former senator leaves', print('============== DEMO ==============') tagger = RegexpTagger( [('^(David|Mary|John)$', 'NNP'), ('^(sees|eats|chases|believes|gives|sleeps|chases|persuades|tries|seems|leaves)$', 'VB'), ('^(go|order|vanish|find|approach)$', 'VB'), ('^(a)$', 'ex_quant'), ('^(every)$', 'univ_quant'), ('^(sandwich|man|dog|pizza|unicorn|cat|senator)$', 'NN'), ('^(big|gray|former)$', 'JJ'), ('^(him|himself)$', 'PRP') ]) depparser = MaltParser(tagger=tagger) glue = Glue(depparser=depparser, verbose=False) for (i, sentence) in enumerate(examples): if i==show_example or show_example==-1: print('[[[Example %s]]] %s' % (i, sentence)) for reading in glue.parse_to_meaning(sentence.split()): print(reading.simplify()) print('') if __name__ == '__main__': demo() nltk-3.1/nltk/sem/hole.py0000644000076500000240000003316612607224144015124 0ustar sbstaff00000000000000# Natural Language Toolkit: Logic # # Author: Peter Wang # Updated by: Dan Garrette # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT """ An implementation of the Hole Semantics model, following Blackburn and Bos, Representation and Inference for Natural Language (CSLI, 2005). The semantic representations are built by the grammar hole.fcfg. This module contains driver code to read in sentences and parse them according to a hole semantics grammar. After parsing, the semantic representation is in the form of an underspecified representation that is not easy to read. We use a "plugging" algorithm to convert that representation into first-order logic formulas. """ from __future__ import print_function, unicode_literals from functools import reduce from nltk import compat from nltk.parse import load_parser from nltk.sem.skolemize import skolemize from nltk.sem.logic import (AllExpression, AndExpression, ApplicationExpression, ExistsExpression, IffExpression, ImpExpression, LambdaExpression, NegatedExpression, OrExpression) # Note that in this code there may be multiple types of trees being referred to: # # 1. parse trees # 2. the underspecified representation # 3. first-order logic formula trees # 4. the search space when plugging (search tree) # class Constants(object): ALL = 'ALL' EXISTS = 'EXISTS' NOT = 'NOT' AND = 'AND' OR = 'OR' IMP = 'IMP' IFF = 'IFF' PRED = 'PRED' LEQ = 'LEQ' HOLE = 'HOLE' LABEL = 'LABEL' MAP = {ALL: lambda v, e: AllExpression(v.variable, e), EXISTS: lambda v, e: ExistsExpression(v.variable, e), NOT: NegatedExpression, AND: AndExpression, OR: OrExpression, IMP: ImpExpression, IFF: IffExpression, PRED: ApplicationExpression} class HoleSemantics(object): """ This class holds the broken-down components of a hole semantics, i.e. it extracts the holes, labels, logic formula fragments and constraints out of a big conjunction of such as produced by the hole semantics grammar. It then provides some operations on the semantics dealing with holes, labels and finding legal ways to plug holes with labels. """ def __init__(self, usr): """ Constructor. `usr' is a ``sem.Expression`` representing an Underspecified Representation Structure (USR). 
A USR has the following special predicates: ALL(l,v,n), EXISTS(l,v,n), AND(l,n,n), OR(l,n,n), IMP(l,n,n), IFF(l,n,n), PRED(l,v,n,v[,v]*) where the brackets and star indicate zero or more repetitions, LEQ(n,n), HOLE(n), LABEL(n) where l is the label of the node described by the predicate, n is either a label or a hole, and v is a variable. """ self.holes = set() self.labels = set() self.fragments = {} # mapping of label -> formula fragment self.constraints = set() # set of Constraints self._break_down(usr) self.top_most_labels = self._find_top_most_labels() self.top_hole = self._find_top_hole() def is_node(self, x): """ Return true if x is a node (label or hole) in this semantic representation. """ return x in (self.labels | self.holes) def _break_down(self, usr): """ Extract holes, labels, formula fragments and constraints from the hole semantics underspecified representation (USR). """ if isinstance(usr, AndExpression): self._break_down(usr.first) self._break_down(usr.second) elif isinstance(usr, ApplicationExpression): func, args = usr.uncurry() if func.variable.name == Constants.LEQ: self.constraints.add(Constraint(args[0], args[1])) elif func.variable.name == Constants.HOLE: self.holes.add(args[0]) elif func.variable.name == Constants.LABEL: self.labels.add(args[0]) else: label = args[0] assert label not in self.fragments self.fragments[label] = (func, args[1:]) else: raise ValueError(usr.label()) def _find_top_nodes(self, node_list): top_nodes = node_list.copy() for f in compat.itervalues(self.fragments): # the label is the first argument of the predicate args = f[1] for arg in args: if arg in node_list: top_nodes.discard(arg) return top_nodes def _find_top_most_labels(self): """ Return the set of labels which are not referenced directly as part of another formula fragment. These will be the top-most labels for the subtree that they are part of. """ return self._find_top_nodes(self.labels) def _find_top_hole(self): """ Return the hole that will be the top of the formula tree. """ top_holes = self._find_top_nodes(self.holes) assert len(top_holes) == 1 # it must be unique return top_holes.pop() def pluggings(self): """ Calculate and return all the legal pluggings (mappings of labels to holes) of this semantics given the constraints. """ record = [] self._plug_nodes([(self.top_hole, [])], self.top_most_labels, {}, record) return record def _plug_nodes(self, queue, potential_labels, plug_acc, record): """ Plug the nodes in `queue' with the labels in `potential_labels'. Each element of `queue' is a tuple of the node to plug and the list of ancestor holes from the root of the graph to that node. `potential_labels' is a set of the labels which are still available for plugging. `plug_acc' is the incomplete mapping of holes to labels made on the current branch of the search tree so far. `record' is a list of all the complete pluggings that we have found in total so far. It is the only parameter that is destructively updated. """ if queue != []: (node, ancestors) = queue[0] if node in self.holes: # The node is a hole, try to plug it. self._plug_hole(node, ancestors, queue[1:], potential_labels, plug_acc, record) else: assert node in self.labels # The node is a label. Replace it in the queue by the holes and # labels in the formula fragment named by that label. 
args = self.fragments[node][1] head = [(a, ancestors) for a in args if self.is_node(a)] self._plug_nodes(head + queue[1:], potential_labels, plug_acc, record) else: raise Exception('queue empty') def _plug_hole(self, hole, ancestors0, queue, potential_labels0, plug_acc0, record): """ Try all possible ways of plugging a single hole. See _plug_nodes for the meanings of the parameters. """ # Add the current hole we're trying to plug into the list of ancestors. assert hole not in ancestors0 ancestors = [hole] + ancestors0 # Try each potential label in this hole in turn. for l in potential_labels0: # Is the label valid in this hole? if self._violates_constraints(l, ancestors): continue plug_acc = plug_acc0.copy() plug_acc[hole] = l potential_labels = potential_labels0.copy() potential_labels.remove(l) if len(potential_labels) == 0: # No more potential labels. That must mean all the holes have # been filled so we have found a legal plugging so remember it. # # Note that the queue might not be empty because there might # be labels on there that point to formula fragments with # no holes in them. _sanity_check_plugging will make sure # all holes are filled. self._sanity_check_plugging(plug_acc, self.top_hole, []) record.append(plug_acc) else: # Recursively try to fill in the rest of the holes in the # queue. The label we just plugged into the hole could have # holes of its own so at the end of the queue. Putting it on # the end of the queue gives us a breadth-first search, so that # all the holes at level i of the formula tree are filled # before filling level i+1. # A depth-first search would work as well since the trees must # be finite but the bookkeeping would be harder. self._plug_nodes(queue + [(l, ancestors)], potential_labels, plug_acc, record) def _violates_constraints(self, label, ancestors): """ Return True if the `label' cannot be placed underneath the holes given by the set `ancestors' because it would violate the constraints imposed on it. """ for c in self.constraints: if c.lhs == label: if c.rhs not in ancestors: return True return False def _sanity_check_plugging(self, plugging, node, ancestors): """ Make sure that a given plugging is legal. We recursively go through each node and make sure that no constraints are violated. We also check that all holes have been filled. """ if node in self.holes: ancestors = [node] + ancestors label = plugging[node] else: label = node assert label in self.labels for c in self.constraints: if c.lhs == label: assert c.rhs in ancestors args = self.fragments[label][1] for arg in args: if self.is_node(arg): self._sanity_check_plugging(plugging, arg, [label] + ancestors) def formula_tree(self, plugging): """ Return the first-order logic formula tree for this underspecified representation using the plugging given. """ return self._formula_tree(plugging, self.top_hole) def _formula_tree(self, plugging, node): if node in plugging: return self._formula_tree(plugging, plugging[node]) elif node in self.fragments: pred, args = self.fragments[node] children = [self._formula_tree(plugging, arg) for arg in args] return reduce(Constants.MAP[pred.variable.name], children) else: return node @compat.python_2_unicode_compatible class Constraint(object): """ This class represents a constraint of the form (L =< N), where L is a label and N is a node (a label or a hole). 
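    A small self-contained sketch (plain strings stand in for the label and
    hole expressions that the hole-semantics grammar actually produces):

        >>> c = Constraint('l1', 'h0')
        >>> c
        (l1 < h0)
        >>> c == Constraint('l1', 'h0')
        True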
""" def __init__(self, lhs, rhs): self.lhs = lhs self.rhs = rhs def __eq__(self, other): if self.__class__ == other.__class__: return self.lhs == other.lhs and self.rhs == other.rhs else: return False def __ne__(self, other): return not (self == other) def __hash__(self): return hash(repr(self)) def __repr__(self): return '(%s < %s)' % (self.lhs, self.rhs) def hole_readings(sentence, grammar_filename=None, verbose=False): if not grammar_filename: grammar_filename = 'grammars/sample_grammars/hole.fcfg' if verbose: print('Reading grammar file', grammar_filename) parser = load_parser(grammar_filename) # Parse the sentence. tokens = sentence.split() trees = list(parser.parse(tokens)) if verbose: print('Got %d different parses' % len(trees)) all_readings = [] for tree in trees: # Get the semantic feature from the top of the parse tree. sem = tree.label()['SEM'].simplify() # Print the raw semantic representation. if verbose: print('Raw: ', sem) # Skolemize away all quantifiers. All variables become unique. while isinstance(sem, LambdaExpression): sem = sem.term skolemized = skolemize(sem) if verbose: print('Skolemized:', skolemized) # Break the hole semantics representation down into its components # i.e. holes, labels, formula fragments and constraints. hole_sem = HoleSemantics(skolemized) # Maybe show the details of the semantic representation. if verbose: print('Holes: ', hole_sem.holes) print('Labels: ', hole_sem.labels) print('Constraints: ', hole_sem.constraints) print('Top hole: ', hole_sem.top_hole) print('Top labels: ', hole_sem.top_most_labels) print('Fragments:') for l, f in hole_sem.fragments.items(): print('\t%s: %s' % (l, f)) # Find all the possible ways to plug the formulas together. pluggings = hole_sem.pluggings() # Build FOL formula trees using the pluggings. readings = list(map(hole_sem.formula_tree, pluggings)) # Print out the formulas in a textual format. if verbose: for i, r in enumerate(readings): print() print('%d. %s' % (i, r)) print() all_readings.extend(readings) return all_readings if __name__ == '__main__': for r in hole_readings('a dog barks'): print(r) print() for r in hole_readings('every girl chases a dog'): print(r) nltk-3.1/nltk/sem/lfg.py0000644000076500000240000001514312607224144014740 0ustar sbstaff00000000000000# Natural Language Toolkit: Lexical Functional Grammar # # Author: Dan Garrette # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT from __future__ import print_function, division, unicode_literals from nltk.internals import Counter from nltk.compat import python_2_unicode_compatible @python_2_unicode_compatible class FStructure(dict): def safeappend(self, key, item): """ Append 'item' to the list at 'key'. If no list exists for 'key', then construct one. 
""" if key not in self: self[key] = [] self[key].append(item) def __setitem__(self, key, value): dict.__setitem__(self, key.lower(), value) def __getitem__(self, key): return dict.__getitem__(self, key.lower()) def __contains__(self, key): return dict.__contains__(self, key.lower()) def to_glueformula_list(self, glue_dict): depgraph = self.to_depgraph() return glue_dict.to_glueformula_list(depgraph) def to_depgraph(self, rel=None): from nltk.parse.dependencygraph import DependencyGraph depgraph = DependencyGraph() nodes = depgraph.nodes self._to_depgraph(nodes, 0, 'ROOT') # Add all the dependencies for all the nodes for address, node in nodes.items(): for n2 in (n for n in nodes.values() if n['rel'] != 'TOP'): if n2['head'] == address: relation = n2['rel'] node['deps'].setdefault(relation,[]) node['deps'][relation].append(n2['address']) depgraph.root = nodes[1] return depgraph def _to_depgraph(self, nodes, head, rel): index = len(nodes) nodes[index].update( { 'address': index, 'word': self.pred[0], 'tag': self.pred[1], 'head': head, 'rel': rel, } ) for feature in sorted(self): for item in sorted(self[feature]): if isinstance(item, FStructure): item._to_depgraph(nodes, index, feature) elif isinstance(item, tuple): new_index = len(nodes) nodes[new_index].update( { 'address': new_index, 'word': item[0], 'tag': item[1], 'head': index, 'rel': feature, } ) elif isinstance(item, list): for n in item: n._to_depgraph(nodes, index, feature) else: raise Exception('feature %s is not an FStruct, a list, or a tuple' % feature) @staticmethod def read_depgraph(depgraph): return FStructure._read_depgraph(depgraph.root, depgraph) @staticmethod def _read_depgraph(node, depgraph, label_counter=None, parent=None): if not label_counter: label_counter = Counter() if node['rel'].lower() in ['spec', 'punct']: # the value of a 'spec' entry is a word, not an FStructure return (node['word'], node['tag']) else: fstruct = FStructure() fstruct.pred = None fstruct.label = FStructure._make_label(label_counter.get()) fstruct.parent = parent word, tag = node['word'], node['tag'] if tag[:2] == 'VB': if tag[2:3] == 'D': fstruct.safeappend('tense', ('PAST', 'tense')) fstruct.pred = (word, tag[:2]) if not fstruct.pred: fstruct.pred = (word, tag) children = [depgraph.nodes[idx] for idx in sum(list(node['deps'].values()), [])] for child in children: fstruct.safeappend(child['rel'], FStructure._read_depgraph(child, depgraph, label_counter, fstruct)) return fstruct @staticmethod def _make_label(value): """ Pick an alphabetic character as identifier for an entity in the model. 
:param value: where to index into the list of characters :type value: int """ letter = ['f','g','h','i','j','k','l','m','n','o','p','q','r','s', 't','u','v','w','x','y','z','a','b','c','d','e'][value-1] num = int(value) // 26 if num > 0: return letter + str(num) else: return letter def __repr__(self): return self.__unicode__().replace('\n', '') def __str__(self): return self.pretty_format() def pretty_format(self, indent=3): try: accum = '%s:[' % self.label except NameError: accum = '[' try: accum += 'pred \'%s\'' % (self.pred[0]) except NameError: pass for feature in sorted(self): for item in self[feature]: if isinstance(item, FStructure): next_indent = indent+len(feature)+3+len(self.label) accum += '\n%s%s %s' % (' '*(indent), feature, item.pretty_format(next_indent)) elif isinstance(item, tuple): accum += '\n%s%s \'%s\'' % (' '*(indent), feature, item[0]) elif isinstance(item, list): accum += '\n%s%s {%s}' % (' '*(indent), feature, ('\n%s' % (' '*(indent+len(feature)+2))).join(item)) else: # ERROR raise Exception('feature %s is not an FStruct, a list, or a tuple' % feature) return accum+']' def demo_read_depgraph(): from nltk.parse.dependencygraph import DependencyGraph dg1 = DependencyGraph("""\ Esso NNP 2 SUB said VBD 0 ROOT the DT 5 NMOD Whiting NNP 5 NMOD field NN 6 SUB started VBD 2 VMOD production NN 6 OBJ Tuesday NNP 6 VMOD """) dg2 = DependencyGraph("""\ John NNP 2 SUB sees VBP 0 ROOT Mary NNP 2 OBJ """) dg3 = DependencyGraph("""\ a DT 2 SPEC man NN 3 SUBJ walks VB 0 ROOT """) dg4 = DependencyGraph("""\ every DT 2 SPEC girl NN 3 SUBJ chases VB 0 ROOT a DT 5 SPEC dog NN 3 OBJ """) depgraphs = [dg1,dg2,dg3,dg4] for dg in depgraphs: print(FStructure.read_depgraph(dg)) if __name__ == '__main__': demo_read_depgraph() nltk-3.1/nltk/sem/linearlogic.py0000644000076500000240000004035112607224144016457 0ustar sbstaff00000000000000# Natural Language Toolkit: Linear Logic # # Author: Dan Garrette # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT from __future__ import print_function, unicode_literals from nltk.internals import Counter from nltk.compat import string_types, python_2_unicode_compatible from nltk.sem.logic import LogicParser, APP _counter = Counter() class Tokens(object): #Punctuation OPEN = '(' CLOSE = ')' #Operations IMP = '-o' PUNCT = [OPEN, CLOSE] TOKENS = PUNCT + [IMP] class LinearLogicParser(LogicParser): """A linear logic expression parser.""" def __init__(self): LogicParser.__init__(self) self.operator_precedence = {APP: 1, Tokens.IMP: 2, None: 3} self.right_associated_operations += [Tokens.IMP] def get_all_symbols(self): return Tokens.TOKENS def handle(self, tok, context): if tok not in Tokens.TOKENS: return self.handle_variable(tok, context) elif tok == Tokens.OPEN: return self.handle_open(tok, context) def get_BooleanExpression_factory(self, tok): if tok == Tokens.IMP: return ImpExpression else: return None def make_BooleanExpression(self, factory, first, second): return factory(first, second) def attempt_ApplicationExpression(self, expression, context): """Attempt to make an application expression. If the next tokens are an argument in parens, then the argument expression is a function being applied to the arguments. 
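        (For example, after parsing '(g -o f)' a following '(g)' makes the
        implication the function and the atom g its argument.)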
Otherwise, return the argument expression.""" if self.has_priority(APP, context): if self.inRange(0) and self.token(0) == Tokens.OPEN: self.token() #swallow then open paren argument = self.process_next_expression(APP) self.assertNextToken(Tokens.CLOSE) expression = ApplicationExpression(expression, argument, None) return expression def make_VariableExpression(self, name): if name[0].isupper(): return VariableExpression(name) else: return ConstantExpression(name) @python_2_unicode_compatible class Expression(object): _linear_logic_parser = LinearLogicParser() @classmethod def fromstring(cls, s): return cls._linear_logic_parser.parse(s) def applyto(self, other, other_indices=None): return ApplicationExpression(self, other, other_indices) def __call__(self, other): return self.applyto(other) def __repr__(self): return '<%s %s>' % (self.__class__.__name__, self) @python_2_unicode_compatible class AtomicExpression(Expression): def __init__(self, name, dependencies=None): """ :param name: str for the constant name :param dependencies: list of int for the indices on which this atom is dependent """ assert isinstance(name, string_types) self.name = name if not dependencies: dependencies = [] self.dependencies = dependencies def simplify(self, bindings=None): """ If 'self' is bound by 'bindings', return the atomic to which it is bound. Otherwise, return self. :param bindings: ``BindingDict`` A dictionary of bindings used to simplify :return: ``AtomicExpression`` """ if bindings and self in bindings: return bindings[self] else: return self def compile_pos(self, index_counter, glueFormulaFactory): """ From Iddo Lev's PhD Dissertation p108-109 :param index_counter: ``Counter`` for unique indices :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas :return: (``Expression``,set) for the compiled linear logic and any newly created glue formulas """ self.dependencies = [] return (self, []) def compile_neg(self, index_counter, glueFormulaFactory): """ From Iddo Lev's PhD Dissertation p108-109 :param index_counter: ``Counter`` for unique indices :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas :return: (``Expression``,set) for the compiled linear logic and any newly created glue formulas """ self.dependencies = [] return (self, []) def initialize_labels(self, fstruct): self.name = fstruct.initialize_label(self.name.lower()) def __eq__(self, other): return self.__class__ == other.__class__ and self.name == other.name def __ne__(self, other): return not self == other def __str__(self): accum = self.name if self.dependencies: accum += "%s" % self.dependencies return accum def __hash__(self): return hash(self.name) class ConstantExpression(AtomicExpression): def unify(self, other, bindings): """ If 'other' is a constant, then it must be equal to 'self'. If 'other' is a variable, then it must not be bound to anything other than 'self'. 
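        For example, unifying the constant g with the variable H extends the
        bindings with H: g, while unifying g with a different constant f
        raises UnificationException.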
:param other: ``Expression`` :param bindings: ``BindingDict`` A dictionary of all current bindings :return: ``BindingDict`` A new combined dictionary of of 'bindings' and any new binding :raise UnificationException: If 'self' and 'other' cannot be unified in the context of 'bindings' """ assert isinstance(other, Expression) if isinstance(other, VariableExpression): try: return bindings + BindingDict([(other, self)]) except VariableBindingException: pass elif self == other: return bindings raise UnificationException(self, other, bindings) class VariableExpression(AtomicExpression): def unify(self, other, bindings): """ 'self' must not be bound to anything other than 'other'. :param other: ``Expression`` :param bindings: ``BindingDict`` A dictionary of all current bindings :return: ``BindingDict`` A new combined dictionary of of 'bindings' and the new binding :raise UnificationException: If 'self' and 'other' cannot be unified in the context of 'bindings' """ assert isinstance(other, Expression) try: if self == other: return bindings else: return bindings + BindingDict([(self, other)]) except VariableBindingException: raise UnificationException(self, other, bindings) @python_2_unicode_compatible class ImpExpression(Expression): def __init__(self, antecedent, consequent): """ :param antecedent: ``Expression`` for the antecedent :param consequent: ``Expression`` for the consequent """ assert isinstance(antecedent, Expression) assert isinstance(consequent, Expression) self.antecedent = antecedent self.consequent = consequent def simplify(self, bindings=None): return self.__class__(self.antecedent.simplify(bindings), self.consequent.simplify(bindings)) def unify(self, other, bindings): """ Both the antecedent and consequent of 'self' and 'other' must unify. 
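        For example, (g -o G) unifies with (g -o f) by binding the variable G
        to the constant f.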
:param other: ``ImpExpression`` :param bindings: ``BindingDict`` A dictionary of all current bindings :return: ``BindingDict`` A new combined dictionary of of 'bindings' and any new bindings :raise UnificationException: If 'self' and 'other' cannot be unified in the context of 'bindings' """ assert isinstance(other, ImpExpression) try: return bindings + self.antecedent.unify(other.antecedent, bindings) + self.consequent.unify(other.consequent, bindings) except VariableBindingException: raise UnificationException(self, other, bindings) def compile_pos(self, index_counter, glueFormulaFactory): """ From Iddo Lev's PhD Dissertation p108-109 :param index_counter: ``Counter`` for unique indices :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas :return: (``Expression``,set) for the compiled linear logic and any newly created glue formulas """ (a, a_new) = self.antecedent.compile_neg(index_counter, glueFormulaFactory) (c, c_new) = self.consequent.compile_pos(index_counter, glueFormulaFactory) return (ImpExpression(a,c), a_new + c_new) def compile_neg(self, index_counter, glueFormulaFactory): """ From Iddo Lev's PhD Dissertation p108-109 :param index_counter: ``Counter`` for unique indices :param glueFormulaFactory: ``GlueFormula`` for creating new glue formulas :return: (``Expression``,list of ``GlueFormula``) for the compiled linear logic and any newly created glue formulas """ (a, a_new) = self.antecedent.compile_pos(index_counter, glueFormulaFactory) (c, c_new) = self.consequent.compile_neg(index_counter, glueFormulaFactory) fresh_index = index_counter.get() c.dependencies.append(fresh_index) new_v = glueFormulaFactory('v%s' % fresh_index, a, set([fresh_index])) return (c, a_new + c_new + [new_v]) def initialize_labels(self, fstruct): self.antecedent.initialize_labels(fstruct) self.consequent.initialize_labels(fstruct) def __eq__(self, other): return self.__class__ == other.__class__ and \ self.antecedent == other.antecedent and self.consequent == other.consequent def __ne__(self, other): return not self == other def __str__(self): return "%s%s %s %s%s" % ( Tokens.OPEN, self.antecedent, Tokens.IMP, self.consequent, Tokens.CLOSE) def __hash__(self): return hash('%s%s%s' % (hash(self.antecedent), Tokens.IMP, hash(self.consequent))) @python_2_unicode_compatible class ApplicationExpression(Expression): def __init__(self, function, argument, argument_indices=None): """ :param function: ``Expression`` for the function :param argument: ``Expression`` for the argument :param argument_indices: set for the indices of the glue formula from which the argument came :raise LinearLogicApplicationException: If 'function' cannot be applied to 'argument' given 'argument_indices'. """ function_simp = function.simplify() argument_simp = argument.simplify() assert isinstance(function_simp, ImpExpression) assert isinstance(argument_simp, Expression) bindings = BindingDict() try: if isinstance(function, ApplicationExpression): bindings += function.bindings if isinstance(argument, ApplicationExpression): bindings += argument.bindings bindings += function_simp.antecedent.unify(argument_simp, bindings) except UnificationException as e: raise LinearLogicApplicationException('Cannot apply %s to %s. 
%s' % (function_simp, argument_simp, e)) # If you are running it on complied premises, more conditions apply if argument_indices: # A.dependencies of (A -o (B -o C)) must be a proper subset of argument_indices if not set(function_simp.antecedent.dependencies) < argument_indices: raise LinearLogicApplicationException('Dependencies unfulfilled when attempting to apply Linear Logic formula %s to %s' % (function_simp, argument_simp)) if set(function_simp.antecedent.dependencies) == argument_indices: raise LinearLogicApplicationException('Dependencies not a proper subset of indices when attempting to apply Linear Logic formula %s to %s' % (function_simp, argument_simp)) self.function = function self.argument = argument self.bindings = bindings def simplify(self, bindings=None): """ Since function is an implication, return its consequent. There should be no need to check that the application is valid since the checking is done by the constructor. :param bindings: ``BindingDict`` A dictionary of bindings used to simplify :return: ``Expression`` """ if not bindings: bindings = self.bindings return self.function.simplify(bindings).consequent def __eq__(self, other): return self.__class__ == other.__class__ and \ self.function == other.function and self.argument == other.argument def __ne__(self, other): return not self == other def __str__(self): return "%s" % self.function + Tokens.OPEN + "%s" % self.argument + Tokens.CLOSE def __hash__(self): return hash('%s%s%s' % (hash(self.antecedent), Tokens.OPEN, hash(self.consequent))) @python_2_unicode_compatible class BindingDict(object): def __init__(self, bindings=None): """ :param bindings: list [(``VariableExpression``, ``AtomicExpression``)] to initialize the dictionary dict {``VariableExpression``: ``AtomicExpression``} to initialize the dictionary """ self.d = {} if isinstance(bindings, dict): bindings = bindings.items() if bindings: for (v, b) in bindings: self[v] = b def __setitem__(self, variable, binding): """ A binding is consistent with the dict if its variable is not already bound, OR if its variable is already bound to its argument. 
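        For example, binding H to g twice is accepted, but a later attempt to
        bind H to f raises VariableBindingException.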
:param variable: ``VariableExpression`` The variable bind :param binding: ``Expression`` The expression to which 'variable' should be bound :raise VariableBindingException: If the variable cannot be bound in this dictionary """ assert isinstance(variable, VariableExpression) assert isinstance(binding, Expression) assert variable != binding existing = self.d.get(variable, None) if not existing or binding == existing: self.d[variable] = binding else: raise VariableBindingException('Variable %s already bound to another value' % (variable)) def __getitem__(self, variable): """ Return the expression to which 'variable' is bound """ assert isinstance(variable, VariableExpression) intermediate = self.d[variable] while intermediate: try: intermediate = self.d[intermediate] except KeyError: return intermediate def __contains__(self, item): return item in self.d def __add__(self, other): """ :param other: ``BindingDict`` The dict with which to combine self :return: ``BindingDict`` A new dict containing all the elements of both parameters :raise VariableBindingException: If the parameter dictionaries are not consistent with each other """ try: combined = BindingDict() for v in self.d: combined[v] = self.d[v] for v in other.d: combined[v] = other.d[v] return combined except VariableBindingException: raise VariableBindingException('Attempting to add two contradicting'\ ' VariableBindingsLists: %s, %s' % (self, other)) def __ne__(self, other): return not self == other def __eq__(self, other): if not isinstance(other, BindingDict): raise TypeError return self.d == other.d def __str__(self): return '{' + ', '.join('%s: %s' % (v, self.d[v]) for v in self.d) + '}' def __repr__(self): return 'BindingDict: %s' % self class VariableBindingException(Exception): pass class UnificationException(Exception): def __init__(self, a, b, bindings): Exception.__init__(self, 'Cannot unify %s with %s given %s' % (a, b, bindings)) class LinearLogicApplicationException(Exception): pass def demo(): lexpr = Expression.fromstring print(lexpr(r'f')) print(lexpr(r'(g -o f)')) print(lexpr(r'((g -o G) -o G)')) print(lexpr(r'g -o h -o f')) print(lexpr(r'(g -o f)(g)').simplify()) print(lexpr(r'(H -o f)(g)').simplify()) print(lexpr(r'((g -o G) -o G)((g -o f))').simplify()) print(lexpr(r'(H -o H)((g -o f))').simplify()) if __name__ == '__main__': demo() nltk-3.1/nltk/sem/logic.py0000644000076500000240000020547512607224144015276 0ustar sbstaff00000000000000# Natural Language Toolkit: Logic # # Author: Dan Garrette # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT """ A version of first order predicate logic, built on top of the typed lambda calculus. """ from __future__ import print_function, unicode_literals import re import operator from collections import defaultdict from functools import reduce from nltk.internals import Counter from nltk.compat import (total_ordering, string_types, python_2_unicode_compatible) APP = 'APP' _counter = Counter() class Tokens(object): LAMBDA = '\\'; LAMBDA_LIST = ['\\'] #Quantifiers EXISTS = 'exists'; EXISTS_LIST = ['some', 'exists', 'exist'] ALL = 'all'; ALL_LIST = ['all', 'forall'] #Punctuation DOT = '.' 
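    # The dot separates a binder's variables from its body, as in
    # '\x.P(x)' and 'exists x.P(x)'.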
OPEN = '(' CLOSE = ')' COMMA = ',' #Operations NOT = '-'; NOT_LIST = ['not', '-', '!'] AND = '&'; AND_LIST = ['and', '&', '^'] OR = '|'; OR_LIST = ['or', '|'] IMP = '->'; IMP_LIST = ['implies', '->', '=>'] IFF = '<->'; IFF_LIST = ['iff', '<->', '<=>'] EQ = '='; EQ_LIST = ['=', '=='] NEQ = '!='; NEQ_LIST = ['!='] #Collections of tokens BINOPS = AND_LIST + OR_LIST + IMP_LIST + IFF_LIST QUANTS = EXISTS_LIST + ALL_LIST PUNCT = [DOT, OPEN, CLOSE, COMMA] TOKENS = BINOPS + EQ_LIST + NEQ_LIST + QUANTS + LAMBDA_LIST + PUNCT + NOT_LIST #Special SYMBOLS = [x for x in TOKENS if re.match(r'^[-\\.(),!&^|>=<]*$', x)] def boolean_ops(): """ Boolean operators """ names = ["negation", "conjunction", "disjunction", "implication", "equivalence"] for pair in zip(names, [Tokens.NOT, Tokens.AND, Tokens.OR, Tokens.IMP, Tokens.IFF]): print("%-15s\t%s" % pair) def equality_preds(): """ Equality predicates """ names = ["equality", "inequality"] for pair in zip(names, [Tokens.EQ, Tokens.NEQ]): print("%-15s\t%s" % pair) def binding_ops(): """ Binding operators """ names = ["existential", "universal", "lambda"] for pair in zip(names, [Tokens.EXISTS, Tokens.ALL, Tokens.LAMBDA]): print("%-15s\t%s" % pair) @python_2_unicode_compatible class LogicParser(object): """A lambda calculus expression parser.""" def __init__(self, type_check=False): """ :param type_check: bool should type checking be performed? to their types. """ assert isinstance(type_check, bool) self._currentIndex = 0 self._buffer = [] self.type_check = type_check """A list of tuples of quote characters. The 4-tuple is comprised of the start character, the end character, the escape character, and a boolean indicating whether the quotes should be included in the result. Quotes are used to signify that a token should be treated as atomic, ignoring any special characters within the token. The escape character allows the quote end character to be used within the quote. If True, the boolean indicates that the final token should contain the quote and escape characters. This method exists to be overridden""" self.quote_chars = [] self.operator_precedence = dict( [(x,1) for x in Tokens.LAMBDA_LIST] + \ [(x,2) for x in Tokens.NOT_LIST] + \ [(APP,3)] + \ [(x,4) for x in Tokens.EQ_LIST+Tokens.NEQ_LIST] + \ [(x,5) for x in Tokens.QUANTS] + \ [(x,6) for x in Tokens.AND_LIST] + \ [(x,7) for x in Tokens.OR_LIST] + \ [(x,8) for x in Tokens.IMP_LIST] + \ [(x,9) for x in Tokens.IFF_LIST] + \ [(None,10)]) self.right_associated_operations = [APP] def parse(self, data, signature=None): """ Parse the expression. 
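        For example, parse(r'all x.(man(x) -> mortal(x))') returns an
        AllExpression whose term is an ImpExpression.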
:param data: str for the input to be parsed :param signature: ``dict`` that maps variable names to type strings :returns: a parsed Expression """ data = data.rstrip() self._currentIndex = 0 self._buffer, mapping = self.process(data) try: result = self.process_next_expression(None) if self.inRange(0): raise UnexpectedTokenException(self._currentIndex+1, self.token(0)) except LogicalExpressionException as e: msg = '%s\n%s\n%s^' % (e, data, ' '*mapping[e.index-1]) raise LogicalExpressionException(None, msg) if self.type_check: result.typecheck(signature) return result def process(self, data): """Split the data into tokens""" out = [] mapping = {} tokenTrie = StringTrie(self.get_all_symbols()) token = '' data_idx = 0 token_start_idx = data_idx while data_idx < len(data): cur_data_idx = data_idx quoted_token, data_idx = self.process_quoted_token(data_idx, data) if quoted_token: if not token: token_start_idx = cur_data_idx token += quoted_token continue st = tokenTrie c = data[data_idx] symbol = '' while c in st: symbol += c st = st[c] if len(data)-data_idx > len(symbol): c = data[data_idx+len(symbol)] else: break if StringTrie.LEAF in st: #token is a complete symbol if token: mapping[len(out)] = token_start_idx out.append(token) token = '' mapping[len(out)] = data_idx out.append(symbol) data_idx += len(symbol) else: if data[data_idx] in ' \t\n': #any whitespace if token: mapping[len(out)] = token_start_idx out.append(token) token = '' else: if not token: token_start_idx = data_idx token += data[data_idx] data_idx += 1 if token: mapping[len(out)] = token_start_idx out.append(token) mapping[len(out)] = len(data) mapping[len(out)+1] = len(data)+1 return out, mapping def process_quoted_token(self, data_idx, data): token = '' c = data[data_idx] i = data_idx for start, end, escape, incl_quotes in self.quote_chars: if c == start: if incl_quotes: token += c i += 1 while data[i] != end: if data[i] == escape: if incl_quotes: token += data[i] i += 1 if len(data) == i: #if there are no more chars raise LogicalExpressionException(None, "End of input reached. " "Escape character [%s] found at end." % escape) token += data[i] else: token += data[i] i += 1 if len(data) == i: raise LogicalExpressionException(None, "End of input reached. " "Expected: [%s]" % end) if incl_quotes: token += data[i] i += 1 if not token: raise LogicalExpressionException(None, 'Empty quoted token found') break return token, i def get_all_symbols(self): """This method exists to be overridden""" return Tokens.SYMBOLS def inRange(self, location): """Return TRUE if the given location is within the buffer""" return self._currentIndex+location < len(self._buffer) def token(self, location=None): """Get the next waiting token. 
If a location is given, then return the token at currentIndex+location without advancing currentIndex; setting it gives lookahead/lookback capability.""" try: if location is None: tok = self._buffer[self._currentIndex] self._currentIndex += 1 else: tok = self._buffer[self._currentIndex+location] return tok except IndexError: raise ExpectedMoreTokensException(self._currentIndex+1) def isvariable(self, tok): return tok not in Tokens.TOKENS def process_next_expression(self, context): """Parse the next complete expression from the stream and return it.""" try: tok = self.token() except ExpectedMoreTokensException: raise ExpectedMoreTokensException(self._currentIndex+1, message='Expression expected.') accum = self.handle(tok, context) if not accum: raise UnexpectedTokenException(self._currentIndex, tok, message='Expression expected.') return self.attempt_adjuncts(accum, context) def handle(self, tok, context): """This method is intended to be overridden for logics that use different operators or expressions""" if self.isvariable(tok): return self.handle_variable(tok, context) elif tok in Tokens.NOT_LIST: return self.handle_negation(tok, context) elif tok in Tokens.LAMBDA_LIST: return self.handle_lambda(tok, context) elif tok in Tokens.QUANTS: return self.handle_quant(tok, context) elif tok == Tokens.OPEN: return self.handle_open(tok, context) def attempt_adjuncts(self, expression, context): cur_idx = None while cur_idx != self._currentIndex: #while adjuncts are added cur_idx = self._currentIndex expression = self.attempt_EqualityExpression(expression, context) expression = self.attempt_ApplicationExpression(expression, context) expression = self.attempt_BooleanExpression(expression, context) return expression def handle_negation(self, tok, context): return self.make_NegatedExpression(self.process_next_expression(Tokens.NOT)) def make_NegatedExpression(self, expression): return NegatedExpression(expression) def handle_variable(self, tok, context): #It's either: 1) a predicate expression: sees(x,y) # 2) an application expression: P(x) # 3) a solo variable: john OR x accum = self.make_VariableExpression(tok) if self.inRange(0) and self.token(0) == Tokens.OPEN: #The predicate has arguments if not isinstance(accum, FunctionVariableExpression) and \ not isinstance(accum, ConstantExpression): raise LogicalExpressionException(self._currentIndex, "'%s' is an illegal predicate name. " "Individual variables may not be used as " "predicates." % tok) self.token() #swallow the Open Paren #curry the arguments accum = self.make_ApplicationExpression(accum, self.process_next_expression(APP)) while self.inRange(0) and self.token(0) == Tokens.COMMA: self.token() #swallow the comma accum = self.make_ApplicationExpression(accum, self.process_next_expression(APP)) self.assertNextToken(Tokens.CLOSE) return accum def get_next_token_variable(self, description): try: tok = self.token() except ExpectedMoreTokensException as e: raise ExpectedMoreTokensException(e.index, 'Variable expected.') if isinstance(self.make_VariableExpression(tok), ConstantExpression): raise LogicalExpressionException(self._currentIndex, "'%s' is an illegal variable name. " "Constants may not be %s." 
% (tok, description)) return Variable(tok) def handle_lambda(self, tok, context): # Expression is a lambda expression if not self.inRange(0): raise ExpectedMoreTokensException(self._currentIndex+2, message="Variable and Expression expected following lambda operator.") vars = [self.get_next_token_variable('abstracted')] while True: if not self.inRange(0) or (self.token(0) == Tokens.DOT and not self.inRange(1)): raise ExpectedMoreTokensException(self._currentIndex+2, message="Expression expected.") if not self.isvariable(self.token(0)): break # Support expressions like: \x y.M == \x.\y.M vars.append(self.get_next_token_variable('abstracted')) if self.inRange(0) and self.token(0) == Tokens.DOT: self.token() #swallow the dot accum = self.process_next_expression(tok) while vars: accum = self.make_LambdaExpression(vars.pop(), accum) return accum def handle_quant(self, tok, context): # Expression is a quantified expression: some x.M factory = self.get_QuantifiedExpression_factory(tok) if not self.inRange(0): raise ExpectedMoreTokensException(self._currentIndex+2, message="Variable and Expression expected following quantifier '%s'." % tok) vars = [self.get_next_token_variable('quantified')] while True: if not self.inRange(0) or (self.token(0) == Tokens.DOT and not self.inRange(1)): raise ExpectedMoreTokensException(self._currentIndex+2, message="Expression expected.") if not self.isvariable(self.token(0)): break # Support expressions like: some x y.M == some x.some y.M vars.append(self.get_next_token_variable('quantified')) if self.inRange(0) and self.token(0) == Tokens.DOT: self.token() #swallow the dot accum = self.process_next_expression(tok) while vars: accum = self.make_QuanifiedExpression(factory, vars.pop(), accum) return accum def get_QuantifiedExpression_factory(self, tok): """This method serves as a hook for other logic parsers that have different quantifiers""" if tok in Tokens.EXISTS_LIST: return ExistsExpression elif tok in Tokens.ALL_LIST: return AllExpression else: self.assertToken(tok, Tokens.QUANTS) def make_QuanifiedExpression(self, factory, variable, term): return factory(variable, term) def handle_open(self, tok, context): #Expression is in parens accum = self.process_next_expression(None) self.assertNextToken(Tokens.CLOSE) return accum def attempt_EqualityExpression(self, expression, context): """Attempt to make an equality expression. If the next token is an equality operator, then an EqualityExpression will be returned. Otherwise, the parameter will be returned.""" if self.inRange(0): tok = self.token(0) if tok in Tokens.EQ_LIST + Tokens.NEQ_LIST and self.has_priority(tok, context): self.token() #swallow the "=" or "!=" expression = self.make_EqualityExpression(expression, self.process_next_expression(tok)) if tok in Tokens.NEQ_LIST: expression = self.make_NegatedExpression(expression) return expression def make_EqualityExpression(self, first, second): """This method serves as a hook for other logic parsers that have different equality expression classes""" return EqualityExpression(first, second) def attempt_BooleanExpression(self, expression, context): """Attempt to make a boolean expression. If the next token is a boolean operator, then a BooleanExpression will be returned. 
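        (For example, after parsing 'P(x)' a following '&' token combines it
        with the next expression into an AndExpression.)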
Otherwise, the parameter will be returned.""" while self.inRange(0): tok = self.token(0) factory = self.get_BooleanExpression_factory(tok) if factory and self.has_priority(tok, context): self.token() #swallow the operator expression = self.make_BooleanExpression(factory, expression, self.process_next_expression(tok)) else: break return expression def get_BooleanExpression_factory(self, tok): """This method serves as a hook for other logic parsers that have different boolean operators""" if tok in Tokens.AND_LIST: return AndExpression elif tok in Tokens.OR_LIST: return OrExpression elif tok in Tokens.IMP_LIST: return ImpExpression elif tok in Tokens.IFF_LIST: return IffExpression else: return None def make_BooleanExpression(self, factory, first, second): return factory(first, second) def attempt_ApplicationExpression(self, expression, context): """Attempt to make an application expression. The next tokens are a list of arguments in parens, then the argument expression is a function being applied to the arguments. Otherwise, return the argument expression.""" if self.has_priority(APP, context): if self.inRange(0) and self.token(0) == Tokens.OPEN: if not isinstance(expression, LambdaExpression) and \ not isinstance(expression, ApplicationExpression) and \ not isinstance(expression, FunctionVariableExpression) and \ not isinstance(expression, ConstantExpression): raise LogicalExpressionException(self._currentIndex, ("The function '%s" % expression) + "' is not a Lambda Expression, an " "Application Expression, or a " "functional predicate, so it may " "not take arguments.") self.token() #swallow then open paren #curry the arguments accum = self.make_ApplicationExpression(expression, self.process_next_expression(APP)) while self.inRange(0) and self.token(0) == Tokens.COMMA: self.token() #swallow the comma accum = self.make_ApplicationExpression(accum, self.process_next_expression(APP)) self.assertNextToken(Tokens.CLOSE) return accum return expression def make_ApplicationExpression(self, function, argument): return ApplicationExpression(function, argument) def make_VariableExpression(self, name): return VariableExpression(Variable(name)) def make_LambdaExpression(self, variable, term): return LambdaExpression(variable, term) def has_priority(self, operation, context): return self.operator_precedence[operation] < self.operator_precedence[context] or \ (operation in self.right_associated_operations and \ self.operator_precedence[operation] == self.operator_precedence[context]) def assertNextToken(self, expected): try: tok = self.token() except ExpectedMoreTokensException as e: raise ExpectedMoreTokensException(e.index, message="Expected token '%s'." % expected) if isinstance(expected, list): if tok not in expected: raise UnexpectedTokenException(self._currentIndex, tok, expected) else: if tok != expected: raise UnexpectedTokenException(self._currentIndex, tok, expected) def assertToken(self, tok, expected): if isinstance(expected, list): if tok not in expected: raise UnexpectedTokenException(self._currentIndex, tok, expected) else: if tok != expected: raise UnexpectedTokenException(self._currentIndex, tok, expected) def __repr__(self): if self.inRange(0): msg = 'Next token: ' + self.token(0) else: msg = 'No more tokens' return '<' + self.__class__.__name__ + ': ' + msg + '>' def read_logic(s, logic_parser=None, encoding=None): """ Convert a file of First Order Formulas into a list of {Expression}s. 
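    Blank lines and lines beginning with '#' are skipped; a line that fails
    to parse raises ValueError.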
:param s: the contents of the file :type s: str :param logic_parser: The parser to be used to parse the logical expression :type logic_parser: LogicParser :param encoding: the encoding of the input string, if it is binary :type encoding: str :return: a list of parsed formulas. :rtype: list(Expression) """ if encoding is not None: s = s.decode(encoding) if logic_parser is None: logic_parser = LogicParser() statements = [] for linenum, line in enumerate(s.splitlines()): line = line.strip() if line.startswith('#') or line=='': continue try: statements.append(logic_parser.parse(line)) except LogicalExpressionException: raise ValueError('Unable to parse line %s: %s' % (linenum, line)) return statements @total_ordering @python_2_unicode_compatible class Variable(object): def __init__(self, name): """ :param name: the name of the variable """ assert isinstance(name, string_types), "%s is not a string" % name self.name = name def __eq__(self, other): return isinstance(other, Variable) and self.name == other.name def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, Variable): raise TypeError return self.name < other.name def substitute_bindings(self, bindings): return bindings.get(self, self) def __hash__(self): return hash(self.name) def __str__(self): return self.name def __repr__(self): return "Variable('%s')" % self.name def unique_variable(pattern=None, ignore=None): """ Return a new, unique variable. :param pattern: ``Variable`` that is being replaced. The new variable must be the same type. :param term: a set of ``Variable`` objects that should not be returned from this function. :rtype: Variable """ if pattern is not None: if is_indvar(pattern.name): prefix = 'z' elif is_funcvar(pattern.name): prefix = 'F' elif is_eventvar(pattern.name): prefix = 'e0' else: assert False, "Cannot generate a unique constant" else: prefix = 'z' v = Variable("%s%s" % (prefix, _counter.get())) while ignore is not None and v in ignore: v = Variable("%s%s" % (prefix, _counter.get())) return v def skolem_function(univ_scope=None): """ Return a skolem function over the variables in univ_scope param univ_scope """ skolem = VariableExpression(Variable('F%s' % _counter.get())) if univ_scope: for v in list(univ_scope): skolem = skolem(VariableExpression(v)) return skolem @python_2_unicode_compatible class Type(object): def __repr__(self): return "%s" % self def __hash__(self): return hash("%s" % self) @classmethod def fromstring(cls, s): return read_type(s) @python_2_unicode_compatible class ComplexType(Type): def __init__(self, first, second): assert(isinstance(first, Type)), "%s is not a Type" % first assert(isinstance(second, Type)), "%s is not a Type" % second self.first = first self.second = second def __eq__(self, other): return isinstance(other, ComplexType) and \ self.first == other.first and \ self.second == other.second def __ne__(self, other): return not self == other __hash__ = Type.__hash__ def matches(self, other): if isinstance(other, ComplexType): return self.first.matches(other.first) and \ self.second.matches(other.second) else: return self == ANY_TYPE def resolve(self, other): if other == ANY_TYPE: return self elif isinstance(other, ComplexType): f = self.first.resolve(other.first) s = self.second.resolve(other.second) if f and s: return ComplexType(f,s) else: return None elif self == ANY_TYPE: return other else: return None def __str__(self): if self == ANY_TYPE: return "%s" % ANY_TYPE else: return '<%s,%s>' % (self.first, self.second) def str(self): if 
self == ANY_TYPE: return ANY_TYPE.str() else: return '(%s -> %s)' % (self.first.str(), self.second.str()) class BasicType(Type): def __eq__(self, other): return isinstance(other, BasicType) and ("%s" % self) == ("%s" % other) def __ne__(self, other): return not self == other __hash__ = Type.__hash__ def matches(self, other): return other == ANY_TYPE or self == other def resolve(self, other): if self.matches(other): return self else: return None @python_2_unicode_compatible class EntityType(BasicType): def __str__(self): return 'e' def str(self): return 'IND' @python_2_unicode_compatible class TruthValueType(BasicType): def __str__(self): return 't' def str(self): return 'BOOL' @python_2_unicode_compatible class EventType(BasicType): def __str__(self): return 'v' def str(self): return 'EVENT' @python_2_unicode_compatible class AnyType(BasicType, ComplexType): def __init__(self): pass @property def first(self): return self @property def second(self): return self def __eq__(self, other): return isinstance(other, AnyType) or other.__eq__(self) def __ne__(self, other): return not self == other __hash__ = Type.__hash__ def matches(self, other): return True def resolve(self, other): return other def __str__(self): return '?' def str(self): return 'ANY' TRUTH_TYPE = TruthValueType() ENTITY_TYPE = EntityType() EVENT_TYPE = EventType() ANY_TYPE = AnyType() def read_type(type_string): assert isinstance(type_string, string_types) type_string = type_string.replace(' ', '') #remove spaces if type_string[0] == '<': assert type_string[-1] == '>' paren_count = 0 for i,char in enumerate(type_string): if char == '<': paren_count += 1 elif char == '>': paren_count -= 1 assert paren_count > 0 elif char == ',': if paren_count == 1: break return ComplexType(read_type(type_string[1 :i ]), read_type(type_string[i+1:-1])) elif type_string[0] == "%s" % ENTITY_TYPE: return ENTITY_TYPE elif type_string[0] == "%s" % TRUTH_TYPE: return TRUTH_TYPE elif type_string[0] == "%s" % ANY_TYPE: return ANY_TYPE else: raise LogicalExpressionException("Unexpected character: '%s'." % type_string[0]) class TypeException(Exception): def __init__(self, msg): Exception.__init__(self, msg) class InconsistentTypeHierarchyException(TypeException): def __init__(self, variable, expression=None): if expression: msg = "The variable '%s' was found in multiple places with different"\ " types in '%s'." % (variable, expression) else: msg = "The variable '%s' was found in multiple places with different"\ " types." % (variable) Exception.__init__(self, msg) class TypeResolutionException(TypeException): def __init__(self, expression, other_type): Exception.__init__(self, "The type of '%s', '%s', cannot be " "resolved with type '%s'" % \ (expression, expression.type, other_type)) class IllegalTypeException(TypeException): def __init__(self, expression, other_type, allowed_type): Exception.__init__(self, "Cannot set type of %s '%s' to '%s'; " "must match type '%s'." % (expression.__class__.__name__, expression, other_type, allowed_type)) def typecheck(expressions, signature=None): """ Ensure correct typing across a collection of ``Expression`` objects. 
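    Each variable name must resolve to a single consistent type across all
    of the expressions; a conflicting use raises
    InconsistentTypeHierarchyException.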
:param expressions: a collection of expressions :param signature: dict that maps variable names to types (or string representations of types) """ #typecheck and create master signature for expression in expressions: signature = expression.typecheck(signature) #apply master signature to all expressions for expression in expressions[:-1]: expression.typecheck(signature) return signature class SubstituteBindingsI(object): """ An interface for classes that can perform substitutions for variables. """ def substitute_bindings(self, bindings): """ :return: The object that is obtained by replacing each variable bound by ``bindings`` with its values. Aliases are already resolved. (maybe?) :rtype: (any) """ raise NotImplementedError() def variables(self): """ :return: A list of all variables in this object. """ raise NotImplementedError() @python_2_unicode_compatible class Expression(SubstituteBindingsI): """This is the base abstract object for all logical expressions""" _logic_parser = LogicParser() _type_checking_logic_parser = LogicParser(type_check=True) @classmethod def fromstring(cls, s, type_check=False, signature=None): if type_check: return cls._type_checking_logic_parser.parse(s, signature) else: return cls._logic_parser.parse(s, signature) def __call__(self, other, *additional): accum = self.applyto(other) for a in additional: accum = accum(a) return accum def applyto(self, other): assert isinstance(other, Expression), "%s is not an Expression" % other return ApplicationExpression(self, other) def __neg__(self): return NegatedExpression(self) def negate(self): """If this is a negated expression, remove the negation. Otherwise add a negation.""" return -self def __and__(self, other): if not isinstance(other, Expression): raise TypeError("%s is not an Expression" % other) return AndExpression(self, other) def __or__(self, other): if not isinstance(other, Expression): raise TypeError("%s is not an Expression" % other) return OrExpression(self, other) def __gt__(self, other): if not isinstance(other, Expression): raise TypeError("%s is not an Expression" % other) return ImpExpression(self, other) def __lt__(self, other): if not isinstance(other, Expression): raise TypeError("%s is not an Expression" % other) return IffExpression(self, other) def __eq__(self, other): raise NotImplementedError() def __ne__(self, other): return not self == other def equiv(self, other, prover=None): """ Check for logical equivalence. Pass the expression (self <-> other) to the theorem prover. If the prover says it is valid, then the self and other are equal. :param other: an ``Expression`` to check equality against :param prover: a ``nltk.inference.api.Prover`` """ assert isinstance(other, Expression), "%s is not an Expression" % other if prover is None: from nltk.inference import Prover9 prover = Prover9() bicond = IffExpression(self.simplify(), other.simplify()) return prover.prove(bicond) def __hash__(self): return hash(repr(self)) def substitute_bindings(self, bindings): expr = self for var in expr.variables(): if var in bindings: val = bindings[var] if isinstance(val, Variable): val = self.make_VariableExpression(val) elif not isinstance(val, Expression): raise ValueError('Can not substitute a non-expression ' 'value into an expression: %r' % (val,)) # Substitute bindings in the target value. val = val.substitute_bindings(bindings) # Replace var w/ the target value. expr = expr.replace(var, val) return expr.simplify() def typecheck(self, signature=None): """ Infer and check types. 
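        The optional signature can pre-assign types by name, for example
        {'dog': '<e,t>'}.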
Raise exceptions if necessary. :param signature: dict that maps variable names to types (or string representations of types) :return: the signature, plus any additional type mappings """ sig = defaultdict(list) if signature: for key in signature: val = signature[key] varEx = VariableExpression(Variable(key)) if isinstance(val, Type): varEx.type = val else: varEx.type = read_type(val) sig[key].append(varEx) self._set_type(signature=sig) return dict((key, sig[key][0].type) for key in sig) def findtype(self, variable): """ Find the type of the given variable as it is used in this expression. For example, finding the type of "P" in "P(x) & Q(x,y)" yields "" :param variable: Variable """ raise NotImplementedError() def _set_type(self, other_type=ANY_TYPE, signature=None): """ Set the type of this expression to be the given type. Raise type exceptions where applicable. :param other_type: Type :param signature: dict(str -> list(AbstractVariableExpression)) """ raise NotImplementedError() def replace(self, variable, expression, replace_bound=False, alpha_convert=True): """ Replace every instance of 'variable' with 'expression' :param variable: ``Variable`` The variable to replace :param expression: ``Expression`` The expression with which to replace it :param replace_bound: bool Should bound variables be replaced? :param alpha_convert: bool Alpha convert automatically to avoid name clashes? """ assert isinstance(variable, Variable), "%s is not a Variable" % variable assert isinstance(expression, Expression), "%s is not an Expression" % expression return self.visit_structured(lambda e: e.replace(variable, expression, replace_bound, alpha_convert), self.__class__) def normalize(self, newvars=None): """Rename auto-generated unique variables""" def get_indiv_vars(e): if isinstance(e, IndividualVariableExpression): return set([e]) elif isinstance(e, AbstractVariableExpression): return set() else: return e.visit(get_indiv_vars, lambda parts: reduce(operator.or_, parts, set())) result = self for i,e in enumerate(sorted(get_indiv_vars(self), key=lambda e: e.variable)): if isinstance(e,EventVariableExpression): newVar = e.__class__(Variable('e0%s' % (i+1))) elif isinstance(e,IndividualVariableExpression): newVar = e.__class__(Variable('z%s' % (i+1))) else: newVar = e result = result.replace(e.variable, newVar, True) return result def visit(self, function, combinator): """ Recursively visit subexpressions. Apply 'function' to each subexpression and pass the result of each function application to the 'combinator' for aggregation: return combinator(map(function, self.subexpressions)) Bound variables are neither applied upon by the function nor given to the combinator. :param function: ``Function`` to call on each subexpression :param combinator: ``Function,R>`` to combine the results of the function calls :return: result of combination ``R`` """ raise NotImplementedError() def visit_structured(self, function, combinator): """ Recursively visit subexpressions. Apply 'function' to each subexpression and pass the result of each function application to the 'combinator' for aggregation. The combinator must have the same signature as the constructor. The function is not applied to bound variables, but they are passed to the combinator. 
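        For example, simplify() is implemented as
        self.visit_structured(lambda e: e.simplify(), self.__class__).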
:param function: ``Function`` to call on each subexpression :param combinator: ``Function`` with the same signature as the constructor, to combine the results of the function calls :return: result of combination """ return self.visit(function, lambda parts: combinator(*parts)) def __repr__(self): return '<%s %s>' % (self.__class__.__name__, self) def __str__(self): return self.str() def variables(self): """ Return a set of all the variables for binding substitution. The variables returned include all free (non-bound) individual variables and any variable starting with '?' or '@'. :return: set of ``Variable`` objects """ return self.free() | set(p for p in self.predicates()|self.constants() if re.match('^[?@]', p.name)) def free(self): """ Return a set of all the free (non-bound) variables. This includes both individual and predicate variables, but not constants. :return: set of ``Variable`` objects """ return self.visit(lambda e: e.free(), lambda parts: reduce(operator.or_, parts, set())) def constants(self): """ Return a set of individual constants (non-predicates). :return: set of ``Variable`` objects """ return self.visit(lambda e: e.constants(), lambda parts: reduce(operator.or_, parts, set())) def predicates(self): """ Return a set of predicates (constants, not variables). :return: set of ``Variable`` objects """ return self.visit(lambda e: e.predicates(), lambda parts: reduce(operator.or_, parts, set())) def simplify(self): """ :return: beta-converted version of this expression """ return self.visit_structured(lambda e: e.simplify(), self.__class__) def make_VariableExpression(self, variable): return VariableExpression(variable) @python_2_unicode_compatible class ApplicationExpression(Expression): r""" This class is used to represent two related types of logical expressions. The first is a Predicate Expression, such as "P(x,y)". A predicate expression is comprised of a ``FunctionVariableExpression`` or ``ConstantExpression`` as the predicate and a list of Expressions as the arguments. The second is a an application of one expression to another, such as "(\x.dog(x))(fido)". The reason Predicate Expressions are treated as Application Expressions is that the Variable Expression predicate of the expression may be replaced with another Expression, such as a LambdaExpression, which would mean that the Predicate should be thought of as being applied to the arguments. The logical expression reader will always curry arguments in a application expression. So, "\x y.see(x,y)(john,mary)" will be represented internally as "((\x y.(see(x))(y))(john))(mary)". This simplifies the internals since there will always be exactly one argument in an application. The str() method will usually print the curried forms of application expressions. The one exception is when the the application expression is really a predicate expression (ie, underlying function is an ``AbstractVariableExpression``). This means that the example from above will be returned as "(\x y.see(x,y)(john))(mary)". 
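    The uncurry() method recovers the underlying function and argument list,
    so that see(x,y) yields (see, [x, y]); the pred and args properties
    expose these, and is_atom() reports whether the underlying function is a
    variable or constant rather than a lambda expression.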
""" def __init__(self, function, argument): """ :param function: ``Expression``, for the function expression :param argument: ``Expression``, for the argument """ assert isinstance(function, Expression), "%s is not an Expression" % function assert isinstance(argument, Expression), "%s is not an Expression" % argument self.function = function self.argument = argument def simplify(self): function = self.function.simplify() argument = self.argument.simplify() if isinstance(function, LambdaExpression): return function.term.replace(function.variable, argument).simplify() else: return self.__class__(function, argument) @property def type(self): if isinstance(self.function.type, ComplexType): return self.function.type.second else: return ANY_TYPE def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) self.argument._set_type(ANY_TYPE, signature) try: self.function._set_type(ComplexType(self.argument.type, other_type), signature) except TypeResolutionException: raise TypeException( "The function '%s' is of type '%s' and cannot be applied " "to '%s' of type '%s'. Its argument must match type '%s'." % (self.function, self.function.type, self.argument, self.argument.type, self.function.type.first)) def findtype(self, variable): """:see Expression.findtype()""" assert isinstance(variable, Variable), "%s is not a Variable" % variable if self.is_atom(): function, args = self.uncurry() else: #It's not a predicate expression ("P(x,y)"), so leave args curried function = self.function args = [self.argument] found = [arg.findtype(variable) for arg in [function]+args] unique = [] for f in found: if f != ANY_TYPE: if unique: for u in unique: if f.matches(u): break else: unique.append(f) if len(unique) == 1: return list(unique)[0] else: return ANY_TYPE def constants(self): """:see: Expression.constants()""" if isinstance(self.function, AbstractVariableExpression): function_constants = set() else: function_constants = self.function.constants() return function_constants | self.argument.constants() def predicates(self): """:see: Expression.predicates()""" if isinstance(self.function, ConstantExpression): function_preds = set([self.function.variable]) else: function_preds = self.function.predicates() return function_preds | self.argument.predicates() def visit(self, function, combinator): """:see: Expression.visit()""" return combinator([function(self.function), function(self.argument)]) def __eq__(self, other): return isinstance(other, ApplicationExpression) and \ self.function == other.function and \ self.argument == other.argument def __ne__(self, other): return not self == other __hash__ = Expression.__hash__ def __str__(self): # uncurry the arguments and find the base function if self.is_atom(): function, args = self.uncurry() arg_str = ','.join("%s" % arg for arg in args) else: #Leave arguments curried function = self.function arg_str = "%s" % self.argument function_str = "%s" % function parenthesize_function = False if isinstance(function, LambdaExpression): if isinstance(function.term, ApplicationExpression): if not isinstance(function.term.function, AbstractVariableExpression): parenthesize_function = True elif not isinstance(function.term, BooleanExpression): parenthesize_function = True elif isinstance(function, ApplicationExpression): parenthesize_function = True if parenthesize_function: function_str = Tokens.OPEN + function_str + Tokens.CLOSE return function_str + Tokens.OPEN + 
arg_str + Tokens.CLOSE def uncurry(self): """ Uncurry this application expression return: A tuple (base-function, arg-list) """ function = self.function args = [self.argument] while isinstance(function, ApplicationExpression): #(\x.\y.sees(x,y)(john))(mary) args.insert(0, function.argument) function = function.function return (function, args) @property def pred(self): """ Return uncurried base-function. If this is an atom, then the result will be a variable expression. Otherwise, it will be a lambda expression. """ return self.uncurry()[0] @property def args(self): """ Return uncurried arg-list """ return self.uncurry()[1] def is_atom(self): """ Is this expression an atom (as opposed to a lambda expression applied to a term)? """ return isinstance(self.pred, AbstractVariableExpression) @total_ordering @python_2_unicode_compatible class AbstractVariableExpression(Expression): """This class represents a variable to be used as a predicate or entity""" def __init__(self, variable): """ :param variable: ``Variable``, for the variable """ assert isinstance(variable, Variable), "%s is not a Variable" % variable self.variable = variable def simplify(self): return self def replace(self, variable, expression, replace_bound=False, alpha_convert=True): """:see: Expression.replace()""" assert isinstance(variable, Variable), "%s is not an Variable" % variable assert isinstance(expression, Expression), "%s is not an Expression" % expression if self.variable == variable: return expression else: return self def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) resolution = other_type for varEx in signature[self.variable.name]: resolution = varEx.type.resolve(resolution) if not resolution: raise InconsistentTypeHierarchyException(self) signature[self.variable.name].append(self) for varEx in signature[self.variable.name]: varEx.type = resolution def findtype(self, variable): """:see Expression.findtype()""" assert isinstance(variable, Variable), "%s is not a Variable" % variable if self.variable == variable: return self.type else: return ANY_TYPE def predicates(self): """:see: Expression.predicates()""" return set() def __eq__(self, other): """Allow equality between instances of ``AbstractVariableExpression`` subtypes.""" return isinstance(other, AbstractVariableExpression) and \ self.variable == other.variable def __ne__(self, other): return not self == other def __lt__(self, other): if not isinstance(other, AbstractVariableExpression): raise TypeError return self.variable < other.variable __hash__ = Expression.__hash__ def __str__(self): return "%s" % self.variable class IndividualVariableExpression(AbstractVariableExpression): """This class represents variables that take the form of a single lowercase character (other than 'e') followed by zero or more digits.""" def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) if not other_type.matches(ENTITY_TYPE): raise IllegalTypeException(self, other_type, ENTITY_TYPE) signature[self.variable.name].append(self) def _get_type(self): return ENTITY_TYPE type = property(_get_type, _set_type) def free(self): """:see: Expression.free()""" return set([self.variable]) def constants(self): """:see: Expression.constants()""" return set() class FunctionVariableExpression(AbstractVariableExpression): """This class represents 
variables that take the form of a single uppercase character followed by zero or more digits.""" type = ANY_TYPE def free(self): """:see: Expression.free()""" return set([self.variable]) def constants(self): """:see: Expression.constants()""" return set() class EventVariableExpression(IndividualVariableExpression): """This class represents variables that take the form of a single lowercase 'e' character followed by zero or more digits.""" type = EVENT_TYPE class ConstantExpression(AbstractVariableExpression): """This class represents variables that do not take the form of a single character followed by zero or more digits.""" type = ENTITY_TYPE def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) if other_type == ANY_TYPE: #entity type by default, for individuals resolution = ENTITY_TYPE else: resolution = other_type if self.type != ENTITY_TYPE: resolution = resolution.resolve(self.type) for varEx in signature[self.variable.name]: resolution = varEx.type.resolve(resolution) if not resolution: raise InconsistentTypeHierarchyException(self) signature[self.variable.name].append(self) for varEx in signature[self.variable.name]: varEx.type = resolution def free(self): """:see: Expression.free()""" return set() def constants(self): """:see: Expression.constants()""" return set([self.variable]) def VariableExpression(variable): """ This is a factory method that instantiates and returns a subtype of ``AbstractVariableExpression`` appropriate for the given variable. """ assert isinstance(variable, Variable), "%s is not a Variable" % variable if is_indvar(variable.name): return IndividualVariableExpression(variable) elif is_funcvar(variable.name): return FunctionVariableExpression(variable) elif is_eventvar(variable.name): return EventVariableExpression(variable) else: return ConstantExpression(variable) class VariableBinderExpression(Expression): """This an abstract class for any Expression that binds a variable in an Expression. 
This includes LambdaExpressions and Quantified Expressions""" def __init__(self, variable, term): """ :param variable: ``Variable``, for the variable :param term: ``Expression``, for the term """ assert isinstance(variable, Variable), "%s is not a Variable" % variable assert isinstance(term, Expression), "%s is not an Expression" % term self.variable = variable self.term = term def replace(self, variable, expression, replace_bound=False, alpha_convert=True): """:see: Expression.replace()""" assert isinstance(variable, Variable), "%s is not a Variable" % variable assert isinstance(expression, Expression), "%s is not an Expression" % expression #if the bound variable is the thing being replaced if self.variable == variable: if replace_bound: assert isinstance(expression, AbstractVariableExpression),\ "%s is not a AbstractVariableExpression" % expression return self.__class__(expression.variable, self.term.replace(variable, expression, True, alpha_convert)) else: return self else: # if the bound variable appears in the expression, then it must # be alpha converted to avoid a conflict if alpha_convert and self.variable in expression.free(): self = self.alpha_convert(unique_variable(pattern=self.variable)) #replace in the term return self.__class__(self.variable, self.term.replace(variable, expression, replace_bound, alpha_convert)) def alpha_convert(self, newvar): """Rename all occurrences of the variable introduced by this variable binder in the expression to ``newvar``. :param newvar: ``Variable``, for the new variable """ assert isinstance(newvar, Variable), "%s is not a Variable" % newvar return self.__class__(newvar, self.term.replace(self.variable, VariableExpression(newvar), True)) def free(self): """:see: Expression.free()""" return self.term.free() - set([self.variable]) def findtype(self, variable): """:see Expression.findtype()""" assert isinstance(variable, Variable), "%s is not a Variable" % variable if variable == self.variable: return ANY_TYPE else: return self.term.findtype(variable) def visit(self, function, combinator): """:see: Expression.visit()""" return combinator([function(self.term)]) def visit_structured(self, function, combinator): """:see: Expression.visit_structured()""" return combinator(self.variable, function(self.term)) def __eq__(self, other): r"""Defines equality modulo alphabetic variance. If we are comparing \x.M and \y.N, then check equality of M and N[x/y].""" if isinstance(self, other.__class__) or \ isinstance(other, self.__class__): if self.variable == other.variable: return self.term == other.term else: # Comparing \x.M and \y.N. Relabel y in N with x and continue. 
varex = VariableExpression(self.variable) return self.term == other.term.replace(other.variable, varex) else: return False def __ne__(self, other): return not self == other __hash__ = Expression.__hash__ @python_2_unicode_compatible class LambdaExpression(VariableBinderExpression): @property def type(self): return ComplexType(self.term.findtype(self.variable), self.term.type) def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) self.term._set_type(other_type.second, signature) if not self.type.resolve(other_type): raise TypeResolutionException(self, other_type) def __str__(self): variables = [self.variable] term = self.term while term.__class__ == self.__class__: variables.append(term.variable) term = term.term return Tokens.LAMBDA + ' '.join("%s" % v for v in variables) + \ Tokens.DOT + "%s" % term @python_2_unicode_compatible class QuantifiedExpression(VariableBinderExpression): @property def type(self): return TRUTH_TYPE def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) if not other_type.matches(TRUTH_TYPE): raise IllegalTypeException(self, other_type, TRUTH_TYPE) self.term._set_type(TRUTH_TYPE, signature) def __str__(self): variables = [self.variable] term = self.term while term.__class__ == self.__class__: variables.append(term.variable) term = term.term return self.getQuantifier() + ' ' + ' '.join("%s" % v for v in variables) + \ Tokens.DOT + "%s" % term class ExistsExpression(QuantifiedExpression): def getQuantifier(self): return Tokens.EXISTS class AllExpression(QuantifiedExpression): def getQuantifier(self): return Tokens.ALL @python_2_unicode_compatible class NegatedExpression(Expression): def __init__(self, term): assert isinstance(term, Expression), "%s is not an Expression" % term self.term = term @property def type(self): return TRUTH_TYPE def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) if not other_type.matches(TRUTH_TYPE): raise IllegalTypeException(self, other_type, TRUTH_TYPE) self.term._set_type(TRUTH_TYPE, signature) def findtype(self, variable): assert isinstance(variable, Variable), "%s is not a Variable" % variable return self.term.findtype(variable) def visit(self, function, combinator): """:see: Expression.visit()""" return combinator([function(self.term)]) def negate(self): """:see: Expression.negate()""" return self.term def __eq__(self, other): return isinstance(other, NegatedExpression) and self.term == other.term def __ne__(self, other): return not self == other __hash__ = Expression.__hash__ def __str__(self): return Tokens.NOT + "%s" % self.term @python_2_unicode_compatible class BinaryExpression(Expression): def __init__(self, first, second): assert isinstance(first, Expression), "%s is not an Expression" % first assert isinstance(second, Expression), "%s is not an Expression" % second self.first = first self.second = second @property def type(self): return TRUTH_TYPE def findtype(self, variable): """:see Expression.findtype()""" assert isinstance(variable, Variable), "%s is not a Variable" % variable f = self.first.findtype(variable) s = self.second.findtype(variable) if f == s or s == ANY_TYPE: return f elif f == ANY_TYPE: return s else: return ANY_TYPE def visit(self, 
function, combinator): """:see: Expression.visit()""" return combinator([function(self.first), function(self.second)]) def __eq__(self, other): return (isinstance(self, other.__class__) or \ isinstance(other, self.__class__)) and \ self.first == other.first and self.second == other.second def __ne__(self, other): return not self == other __hash__ = Expression.__hash__ def __str__(self): first = self._str_subex(self.first) second = self._str_subex(self.second) return Tokens.OPEN + first + ' ' + self.getOp() \ + ' ' + second + Tokens.CLOSE def _str_subex(self, subex): return "%s" % subex class BooleanExpression(BinaryExpression): def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) if not other_type.matches(TRUTH_TYPE): raise IllegalTypeException(self, other_type, TRUTH_TYPE) self.first._set_type(TRUTH_TYPE, signature) self.second._set_type(TRUTH_TYPE, signature) class AndExpression(BooleanExpression): """This class represents conjunctions""" def getOp(self): return Tokens.AND def _str_subex(self, subex): s = "%s" % subex if isinstance(subex, AndExpression): return s[1:-1] return s class OrExpression(BooleanExpression): """This class represents disjunctions""" def getOp(self): return Tokens.OR def _str_subex(self, subex): s = "%s" % subex if isinstance(subex, OrExpression): return s[1:-1] return s class ImpExpression(BooleanExpression): """This class represents implications""" def getOp(self): return Tokens.IMP class IffExpression(BooleanExpression): """This class represents biconditionals""" def getOp(self): return Tokens.IFF class EqualityExpression(BinaryExpression): """This class represents equality expressions like "(x = y)".""" def _set_type(self, other_type=ANY_TYPE, signature=None): """:see Expression._set_type()""" assert isinstance(other_type, Type) if signature is None: signature = defaultdict(list) if not other_type.matches(TRUTH_TYPE): raise IllegalTypeException(self, other_type, TRUTH_TYPE) self.first._set_type(ENTITY_TYPE, signature) self.second._set_type(ENTITY_TYPE, signature) def getOp(self): return Tokens.EQ ### Utilities class StringTrie(defaultdict): LEAF = "" def __init__(self, strings=None): defaultdict.__init__(self, StringTrie) if strings: for string in strings: self.insert(string) def insert(self, string): if len(string): self[string[0]].insert(string[1:]) else: #mark the string is complete self[StringTrie.LEAF] = None class LogicalExpressionException(Exception): def __init__(self, index, message): self.index = index Exception.__init__(self, message) class UnexpectedTokenException(LogicalExpressionException): def __init__(self, index, unexpected=None, expected=None, message=None): if unexpected and expected: msg = "Unexpected token: '%s'. " \ "Expected token '%s'." % (unexpected, expected) elif unexpected: msg = "Unexpected token: '%s'." % unexpected if message: msg += ' '+message else: msg = "Expected token '%s'." % expected LogicalExpressionException.__init__(self, index, msg) class ExpectedMoreTokensException(LogicalExpressionException): def __init__(self, index, message=None): if not message: message = 'More tokens expected.' LogicalExpressionException.__init__(self, index, 'End of input found. ' + message) def is_indvar(expr): """ An individual variable must be a single lowercase character other than 'e', followed by zero or more digits. 
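A hedged doctest sketch (the values follow directly from the pattern described above):

>>> is_indvar('x3'), is_indvar('e3'), is_indvar('mary')
(True, False, False)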
:param expr: str :return: bool True if expr is of the correct form """ assert isinstance(expr, string_types), "%s is not a string" % expr return re.match(r'^[a-df-z]\d*$', expr) is not None def is_funcvar(expr): """ A function variable must be a single uppercase character followed by zero or more digits. :param expr: str :return: bool True if expr is of the correct form """ assert isinstance(expr, string_types), "%s is not a string" % expr return re.match(r'^[A-Z]\d*$', expr) is not None def is_eventvar(expr): """ An event variable must be a single lowercase 'e' character followed by zero or more digits. :param expr: str :return: bool True if expr is of the correct form """ assert isinstance(expr, string_types), "%s is not a string" % expr return re.match(r'^e\d*$', expr) is not None def demo(): lexpr = Expression.fromstring print('='*20 + 'Test reader' + '='*20) print(lexpr(r'john')) print(lexpr(r'man(x)')) print(lexpr(r'-man(x)')) print(lexpr(r'(man(x) & tall(x) & walks(x))')) print(lexpr(r'exists x.(man(x) & tall(x) & walks(x))')) print(lexpr(r'\x.man(x)')) print(lexpr(r'\x.man(x)(john)')) print(lexpr(r'\x y.sees(x,y)')) print(lexpr(r'\x y.sees(x,y)(a,b)')) print(lexpr(r'(\x.exists y.walks(x,y))(x)')) print(lexpr(r'exists x.x = y')) print(lexpr(r'exists x.(x = y)')) print(lexpr('P(x) & x=y & P(y)')) print(lexpr(r'\P Q.exists x.(P(x) & Q(x))')) print(lexpr(r'man(x) <-> tall(x)')) print('='*20 + 'Test simplify' + '='*20) print(lexpr(r'\x.\y.sees(x,y)(john)(mary)').simplify()) print(lexpr(r'\x.\y.sees(x,y)(john, mary)').simplify()) print(lexpr(r'all x.(man(x) & (\x.exists y.walks(x,y))(x))').simplify()) print(lexpr(r'(\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x))(\x.bark(x))').simplify()) print('='*20 + 'Test alpha conversion and binder expression equality' + '='*20) e1 = lexpr('exists x.P(x)') print(e1) e2 = e1.alpha_convert(Variable('z')) print(e2) print(e1 == e2) def demo_errors(): print('='*20 + 'Test reader errors' + '='*20) demoException('(P(x) & Q(x)') demoException('((P(x) &) & Q(x))') demoException('P(x) -> ') demoException('P(x') demoException('P(x,') demoException('P(x,)') demoException('exists') demoException('exists x.') demoException('\\') demoException('\\ x y.') demoException('P(x)Q(x)') demoException('(P(x)Q(x)') demoException('exists x -> y') def demoException(s): try: Expression.fromstring(s) except LogicalExpressionException as e: print("%s: %s" % (e.__class__.__name__, e)) def printtype(ex): print("%s : %s" % (ex.str(), ex.type)) if __name__ == '__main__': demo() # demo_errors() nltk-3.1/nltk/sem/relextract.py0000644000076500000240000003531512607224144016350 0ustar sbstaff00000000000000# Natural Language Toolkit: Relation Extraction # # Copyright (C) 2001-2015 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT """ Code for extracting relational triples from the ieer and conll2002 corpora. Relations are stored internally as dictionaries ('reldicts'). The two serialization outputs are "rtuple" and "clause". - An rtuple is a tuple of the form ``(subj, filler, obj)``, where ``subj`` and ``obj`` are pairs of Named Entity mentions, and ``filler`` is the string of words occurring between ``sub`` and ``obj`` (with no intervening NEs). Strings are printed via ``repr()`` to circumvent locale variations in rendering utf-8 encoded strings. - A clause is an atom of the form ``relsym(subjsym, objsym)``, where the relation, subject and object have been canonicalized to single strings. 
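As a hedged sketch of the two serializations (the reldict below is hand-built with illustrative values, not taken from a corpus run):

>>> from collections import defaultdict
>>> r = defaultdict(str)
>>> r.update(subjclass='ORGANIZATION', subjtext='WHYY', subjsym='whyy', filler='in',
...          objclass='LOCATION', objtext='Philadelphia', objsym='philadelphia')
>>> print(rtuple(r))
[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
>>> print(clause(r, relsym='IN'))
IN('whyy', 'philadelphia')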
""" from __future__ import print_function # todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs? from collections import defaultdict import re from nltk.compat import htmlentitydefs # Dictionary that associates corpora with NE classes NE_CLASSES = { 'ieer': ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'], 'conll2002': ['LOC', 'PER', 'ORG'], 'ace': ['LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE', 'FACILITY', 'GPE'], } # Allow abbreviated class labels short2long = dict(LOC = 'LOCATION', ORG = 'ORGANIZATION', PER = 'PERSON') long2short = dict(LOCATION ='LOC', ORGANIZATION = 'ORG', PERSON = 'PER') def _expand(type): """ Expand an NE class name. :type type: str :rtype: str """ try: return short2long[type] except KeyError: return type def class_abbrev(type): """ Abbreviate an NE class name. :type type: str :rtype: str """ try: return long2short[type] except KeyError: return type def _join(lst, sep=' ', untag=False): """ Join a list into a string, turning tags tuples into tag strings or just words. :param untag: if ``True``, omit the tag from tagged input strings. :type lst: list :rtype: str """ try: return sep.join(lst) except TypeError: if untag: return sep.join(tup[0] for tup in lst) from nltk.tag import tuple2str return sep.join(tuple2str(tup) for tup in lst) def descape_entity(m, defs=htmlentitydefs.entitydefs): """ Translate one entity to its ISO Latin value. Inspired by example from effbot.org """ #s = 'mcglashan_&_sarrail' #l = ['mcglashan', '&', 'sarrail'] #pattern = re.compile("&(\w+?);") #new = list2sym(l) #s = pattern.sub(descape_entity, s) #print s, new try: return defs[m.group(1)] except KeyError: return m.group(0) # use as is def list2sym(lst): """ Convert a list of strings into a canonical symbol. :type lst: list :return: a Unicode string without whitespace :rtype: unicode """ sym = _join(lst, '_', untag=True) sym = sym.lower() ENT = re.compile("&(\w+?);") sym = ENT.sub(descape_entity, sym) sym = sym.replace('.', '') return sym def tree2semi_rel(tree): """ Group a chunk structure into a list of 'semi-relations' of the form (list(str), ``Tree``). In order to facilitate the construction of (``Tree``, string, ``Tree``) triples, this identifies pairs whose first member is a list (possibly empty) of terminal strings, and whose second member is a ``Tree`` of the form (NE_label, terminals). :param tree: a chunk tree :return: a list of pairs (list(str), ``Tree``) :rtype: list of tuple """ from nltk.tree import Tree semi_rels = [] semi_rel = [[], None] for dtr in tree: if not isinstance(dtr, Tree): semi_rel[0].append(dtr) else: # dtr is a Tree semi_rel[1] = dtr semi_rels.append(semi_rel) semi_rel = [[], None] return semi_rels def semi_rel2reldict(pairs, window=5, trace=False): """ Converts the pairs generated by ``tree2semi_rel`` into a 'reldict': a dictionary which stores information about the subject and object NEs plus the filler between them. Additionally, a left and right context of length =< window are captured (within a given input sentence). 
:param pairs: a pair of list(str) and ``Tree``, as generated by :param window: a threshold for the number of items to include in the left and right context :type window: int :return: 'relation' dictionaries whose keys are 'lcon', 'subjclass', 'subjtext', 'subjsym', 'filler', objclass', objtext', 'objsym' and 'rcon' :rtype: list(defaultdict) """ result = [] while len(pairs) > 2: reldict = defaultdict(str) reldict['lcon'] = _join(pairs[0][0][-window:]) reldict['subjclass'] = pairs[0][1].label() reldict['subjtext'] = _join(pairs[0][1].leaves()) reldict['subjsym'] = list2sym(pairs[0][1].leaves()) reldict['filler'] = _join(pairs[1][0]) reldict['untagged_filler'] = _join(pairs[1][0], untag=True) reldict['objclass'] = pairs[1][1].label() reldict['objtext'] = _join(pairs[1][1].leaves()) reldict['objsym'] = list2sym(pairs[1][1].leaves()) reldict['rcon'] = _join(pairs[2][0][:window]) if trace: print("(%s(%s, %s)" % (reldict['untagged_filler'], reldict['subjclass'], reldict['objclass'])) result.append(reldict) pairs = pairs[1:] return result def extract_rels(subjclass, objclass, doc, corpus='ace', pattern=None, window=10): """ Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern. The parameters ``subjclass`` and ``objclass`` can be used to restrict the Named Entities to particular types (any of 'LOCATION', 'ORGANIZATION', 'PERSON', 'DURATION', 'DATE', 'CARDINAL', 'PERCENT', 'MONEY', 'MEASURE'). :param subjclass: the class of the subject Named Entity. :type subjclass: str :param objclass: the class of the object Named Entity. :type objclass: str :param doc: input document :type doc: ieer document or a list of chunk trees :param corpus: name of the corpus to take as input; possible values are 'ieer' and 'conll2002' :type corpus: str :param pattern: a regular expression for filtering the fillers of retrieved triples. :type pattern: SRE_Pattern :param window: filters out fillers which exceed this threshold :type window: int :return: see ``mk_reldicts`` :rtype: list(defaultdict) """ if subjclass and subjclass not in NE_CLASSES[corpus]: if _expand(subjclass) in NE_CLASSES[corpus]: subjclass = _expand(subjclass) else: raise ValueError("your value for the subject type has not been recognized: %s" % subjclass) if objclass and objclass not in NE_CLASSES[corpus]: if _expand(objclass) in NE_CLASSES[corpus]: objclass = _expand(objclass) else: raise ValueError("your value for the object type has not been recognized: %s" % objclass) if corpus == 'ace' or corpus == 'conll2002': pairs = tree2semi_rel(doc) elif corpus == 'ieer': pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline) else: raise ValueError("corpus type not recognized") reldicts = semi_rel2reldict(pairs) relfilter = lambda x: (x['subjclass'] == subjclass and len(x['filler'].split()) <= window and pattern.match(x['filler']) and x['objclass'] == objclass) return list(filter(relfilter, reldicts)) def rtuple(reldict, lcon=False, rcon=False): """ Pretty print the reldict as an rtuple. :param reldict: a relation dictionary :type reldict: defaultdict """ items = [class_abbrev(reldict['subjclass']), reldict['subjtext'], reldict['filler'], class_abbrev(reldict['objclass']), reldict['objtext']] format = '[%s: %r] %r [%s: %r]' if lcon: items = [reldict['lcon']] + items format = '...%r)' + format if rcon: items.append(reldict['rcon']) format = format + '(%r...' printargs = tuple(items) return format % printargs def clause(reldict, relsym): """ Print the relation in clausal form. 
:param reldict: a relation dictionary :type reldict: defaultdict :param relsym: a label for the relation :type relsym: str """ items = (relsym, reldict['subjsym'], reldict['objsym']) return "%s(%r, %r)" % items ####################################################### # Demos of relation extraction with regular expressions ####################################################### ############################################ # Example of in(ORG, LOC) ############################################ def in_demo(trace=0, sql=True): """ Select pairs of organizations and locations whose mentions occur with an intervening occurrence of the preposition "in". If the sql parameter is set to True, then the entity pairs are loaded into an in-memory database, and subsequently pulled out using an SQL "SELECT" query. """ from nltk.corpus import ieer if sql: try: import sqlite3 connection = sqlite3.connect(":memory:") connection.text_factory = sqlite3.OptimizedUnicode cur = connection.cursor() cur.execute("""create table Locations (OrgName text, LocationName text, DocID text)""") except ImportError: import warnings warnings.warn("Cannot import sqlite; sql flag will be ignored.") IN = re.compile(r'.*\bin\b(?!\b.+ing)') print() print("IEER: in(ORG, LOC) -- just the clauses:") print("=" * 45) for file in ieer.fileids(): for doc in ieer.parsed_docs(file): if trace: print(doc.docno) print("=" * 15) for rel in extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN): print(clause(rel, relsym='IN')) if sql: try: rtuple = (rel['subjtext'], rel['objtext'], doc.docno) cur.execute("""insert into Locations values (?, ?, ?)""", rtuple) connection.commit() except NameError: pass if sql: try: cur.execute("""select OrgName from Locations where LocationName = 'Atlanta'""") print() print("Extract data from SQL table: ORGs in Atlanta") print("-" * 15) for row in cur: print(row) except NameError: pass ############################################ # Example of has_role(PER, LOC) ############################################ def roles_demo(trace=0): from nltk.corpus import ieer roles = """ (.*( # assorted roles analyst| chair(wo)?man| commissioner| counsel| director| economist| editor| executive| foreman| governor| head| lawyer| leader| librarian).*)| manager| partner| president| producer| professor| researcher| spokes(wo)?man| writer| ,\sof\sthe?\s* # "X, of (the) Y" """ ROLES = re.compile(roles, re.VERBOSE) print() print("IEER: has_role(PER, ORG) -- raw rtuples:") print("=" * 45) for file in ieer.fileids(): for doc in ieer.parsed_docs(file): lcon = rcon = False if trace: print(doc.docno) print("=" * 15) lcon = rcon = True for rel in extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES): print(rtuple(rel, lcon=lcon, rcon=rcon)) ############################################## ### Show what's in the IEER Headlines ############################################## def ieer_headlines(): from nltk.corpus import ieer from nltk.tree import Tree print("IEER: First 20 Headlines") print("=" * 45) trees = [(doc.docno, doc.headline) for file in ieer.fileids() for doc in ieer.parsed_docs(file)] for tree in trees[:20]: print() print("%s:\n%s" % tree) ############################################# ## Dutch CONLL2002: take_on_role(PER, ORG ############################################# def conllned(trace=1): """ Find the copula+'van' relation ('of') in the Dutch tagged training corpus from CoNLL 2002. 
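A hedged sketch of the kind of tagged filler the ``vnv`` pattern below is intended to match (the compact regexp here is an illustrative, whitespace-stripped equivalent of the VERBOSE pattern, and the tagged string is invented):

>>> import re
>>> VNV = re.compile(r'(is/V|was/V|werd/V|wordt/V).*van/Prep')
>>> bool(VNV.match('was/V voorzitter/N van/Prep'))
True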
""" from nltk.corpus import conll2002 vnv = """ ( is/V| # 3rd sing present and was/V| # past forms of the verb zijn ('be') werd/V| # and also present wordt/V # past of worden ('become) ) .* # followed by anything van/Prep # followed by van ('of') """ VAN = re.compile(vnv, re.VERBOSE) print() print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:") print("=" * 45) for doc in conll2002.chunked_sents('ned.train'): lcon = rcon = False if trace: lcon = rcon = True for rel in extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN, window=10): print(rtuple(rel, lcon=True, rcon=True)) ############################################# ## Spanish CONLL2002: (PER, ORG) ############################################# def conllesp(): from nltk.corpus import conll2002 de = """ .* ( de/SP| del/SP ) """ DE = re.compile(de, re.VERBOSE) print() print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:") print("=" * 45) rels = [rel for doc in conll2002.chunked_sents('esp.train') for rel in extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)] for r in rels[:10]: print(clause(r, relsym='DE')) print() def ne_chunked(): print() print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker") print("=" * 45) ROLE = re.compile(r'.*(chairman|president|trader|scientist|economist|analyst|partner).*') rels = [] for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]): sent = nltk.ne_chunk(sent) rels = extract_rels('PER', 'ORG', sent, corpus='ace', pattern=ROLE, window=7) for rel in rels: print('{0:<5}{1}'.format(i, rtuple(rel))) if __name__ == '__main__': import nltk from nltk.sem import relextract in_demo(trace=0) roles_demo(trace=0) conllned() conllesp() ieer_headlines() ne_chunked() nltk-3.1/nltk/sem/skolemize.py0000644000076500000240000001240712607224144016172 0ustar sbstaff00000000000000# Natural Language Toolkit: Semantic Interpretation # # Author: Ewan Klein # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT from nltk.sem.logic import (AllExpression, AndExpression, ApplicationExpression, EqualityExpression, ExistsExpression, IffExpression, ImpExpression, NegatedExpression, OrExpression, VariableExpression, skolem_function, unique_variable) def skolemize(expression, univ_scope=None, used_variables=None): """ Skolemize the expression and convert to conjunctive normal form (CNF) """ if univ_scope is None: univ_scope = set() if used_variables is None: used_variables = set() if isinstance(expression, AllExpression): term = skolemize(expression.term, univ_scope|set([expression.variable]), used_variables|set([expression.variable])) return term.replace(expression.variable, VariableExpression(unique_variable(ignore=used_variables))) elif isinstance(expression, AndExpression): return skolemize(expression.first, univ_scope, used_variables) &\ skolemize(expression.second, univ_scope, used_variables) elif isinstance(expression, OrExpression): return to_cnf(skolemize(expression.first, univ_scope, used_variables), skolemize(expression.second, univ_scope, used_variables)) elif isinstance(expression, ImpExpression): return to_cnf(skolemize(-expression.first, univ_scope, used_variables), skolemize(expression.second, univ_scope, used_variables)) elif isinstance(expression, IffExpression): return to_cnf(skolemize(-expression.first, univ_scope, used_variables), skolemize(expression.second, univ_scope, used_variables)) &\ to_cnf(skolemize(expression.first, univ_scope, used_variables), skolemize(-expression.second, univ_scope, 
used_variables)) elif isinstance(expression, EqualityExpression): return expression elif isinstance(expression, NegatedExpression): negated = expression.term if isinstance(negated, AllExpression): term = skolemize(-negated.term, univ_scope, used_variables|set([negated.variable])) if univ_scope: return term.replace(negated.variable, skolem_function(univ_scope)) else: skolem_constant = VariableExpression(unique_variable(ignore=used_variables)) return term.replace(negated.variable, skolem_constant) elif isinstance(negated, AndExpression): return to_cnf(skolemize(-negated.first, univ_scope, used_variables), skolemize(-negated.second, univ_scope, used_variables)) elif isinstance(negated, OrExpression): return skolemize(-negated.first, univ_scope, used_variables) &\ skolemize(-negated.second, univ_scope, used_variables) elif isinstance(negated, ImpExpression): return skolemize(negated.first, univ_scope, used_variables) &\ skolemize(-negated.second, univ_scope, used_variables) elif isinstance(negated, IffExpression): return to_cnf(skolemize(-negated.first, univ_scope, used_variables), skolemize(-negated.second, univ_scope, used_variables)) &\ to_cnf(skolemize(negated.first, univ_scope, used_variables), skolemize(negated.second, univ_scope, used_variables)) elif isinstance(negated, EqualityExpression): return expression elif isinstance(negated, NegatedExpression): return skolemize(negated.term, univ_scope, used_variables) elif isinstance(negated, ExistsExpression): term = skolemize(-negated.term, univ_scope|set([negated.variable]), used_variables|set([negated.variable])) return term.replace(negated.variable, VariableExpression(unique_variable(ignore=used_variables))) elif isinstance(negated, ApplicationExpression): return expression else: raise Exception('\'%s\' cannot be skolemized' % expression) elif isinstance(expression, ExistsExpression): term = skolemize(expression.term, univ_scope, used_variables|set([expression.variable])) if univ_scope: return term.replace(expression.variable, skolem_function(univ_scope)) else: skolem_constant = VariableExpression(unique_variable(ignore=used_variables)) return term.replace(expression.variable, skolem_constant) elif isinstance(expression, ApplicationExpression): return expression else: raise Exception('\'%s\' cannot be skolemized' % expression) def to_cnf(first, second): """ Convert this split disjunction to conjunctive normal form (CNF) """ if isinstance(first, AndExpression): r_first = to_cnf(first.first, second) r_second = to_cnf(first.second, second) return r_first & r_second elif isinstance(second, AndExpression): r_first = to_cnf(first, second.first) r_second = to_cnf(first, second.second) return r_first & r_second else: return first | second nltk-3.1/nltk/sem/util.py0000644000076500000240000001766712607224144015162 0ustar sbstaff00000000000000# Natural Language Toolkit: Semantic Interpretation # # Author: Ewan Klein # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT """ Utility functions for batch-processing sentences: parsing and extraction of the semantic representation of the root node of the the syntax tree, followed by evaluation of the semantic representation in a first-order model. 
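A hedged sketch of the intended workflow (the sentence is illustrative; the grammar is the sample grammar also used by ``demo()`` below, and requires the ``sample_grammars`` data package)::

    from nltk.sem import util
    readings = util.interpret_sents(['every girl chases a dog'],
                                    'grammars/sample_grammars/sem2.fcfg')
    for (syntree, semrep) in readings[0]:
        print(semrep)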
""" from __future__ import print_function, unicode_literals import codecs from nltk.sem import evaluate ############################################################## ## Utility functions for connecting parse output to semantics ############################################################## def parse_sents(inputs, grammar, trace=0): """ Convert input sentences into syntactic trees. :param inputs: sentences to be parsed :type inputs: list of str :param grammar: ``FeatureGrammar`` or name of feature-based grammar :rtype: dict :return: a mapping from input sentences to a list of ``Tree``s """ # put imports here to avoid circult dependencies from nltk.grammar import FeatureGrammar from nltk.parse import FeatureChartParser, load_parser if isinstance(grammar, FeatureGrammar): cp = FeatureChartParser(grammar) else: cp = load_parser(grammar, trace=trace) parses = [] for sent in inputs: tokens = sent.split() # use a tokenizer? syntrees = list(cp.parse(tokens)) parses.append(syntrees) return parses def root_semrep(syntree, semkey='SEM'): """ Find the semantic representation at the root of a tree. :param syntree: a parse ``Tree`` :param semkey: the feature label to use for the root semantics in the tree :return: the semantic representation at the root of a ``Tree`` :rtype: sem.Expression """ from nltk.grammar import FeatStructNonterminal node = syntree.label() assert isinstance(node, FeatStructNonterminal) try: return node[semkey] except KeyError: print(node, end=' ') print("has no specification for the feature %s" % semkey) raise def interpret_sents(inputs, grammar, semkey='SEM', trace=0): """ Add the semantic representation to each syntactic parse tree of each input sentence. :param inputs: a list of sentences :param grammar: ``FeatureGrammar`` or name of feature-based grammar :return: a mapping from sentences to lists of pairs (parse-tree, semantic-representations) :rtype: dict """ return [[(syn, root_semrep(syn, semkey)) for syn in syntrees] for syntrees in parse_sents(inputs, grammar, trace=trace)] def evaluate_sents(inputs, grammar, model, assignment, trace=0): """ Add the truth-in-a-model value to each semantic representation for each syntactic parse of each input sentences. :param inputs: a list of sentences :param grammar: ``FeatureGrammar`` or name of feature-based grammar :return: a mapping from sentences to lists of triples (parse-tree, semantic-representations, evaluation-in-model) :rtype: dict """ return [[(syn, sem, model.evaluate("%s" % sem, assignment, trace=trace)) for (syn, sem) in interpretations] for interpretations in interpret_sents(inputs, grammar)] def demo_model0(): global m0, g0 #Initialize a valuation of non-logical constants.""" v = [('john', 'b1'), ('mary', 'g1'), ('suzie', 'g2'), ('fido', 'd1'), ('tess', 'd2'), ('noosa', 'n'), ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1', 'd2'])), ('bark', set(['d1', 'd2'])), ('walk', set(['b1', 'g2', 'd1'])), ('chase', set([('b1', 'g1'), ('b2', 'g1'), ('g1', 'd1'), ('g2', 'd2')])), ('see', set([('b1', 'g1'), ('b2', 'd2'), ('g1', 'b1'),('d2', 'b1'), ('g2', 'n')])), ('in', set([('b1', 'n'), ('b2', 'n'), ('d2', 'n')])), ('with', set([('b1', 'g1'), ('g1', 'b1'), ('d1', 'b1'), ('b1', 'd1')])) ] #Read in the data from ``v`` val = evaluate.Valuation(v) #Bind ``dom`` to the ``domain`` property of ``val`` dom = val.domain #Initialize a model with parameters ``dom`` and ``val``. 
m0 = evaluate.Model(dom, val) #Initialize a variable assignment with parameter ``dom`` g0 = evaluate.Assignment(dom) def read_sents(filename, encoding='utf8'): with codecs.open(filename, 'r', encoding) as fp: sents = [l.rstrip() for l in fp] # get rid of blank lines sents = [l for l in sents if len(l) > 0] sents = [l for l in sents if not l[0] == '#'] return sents def demo_legacy_grammar(): """ Check that interpret_sents() is compatible with legacy grammars that use a lowercase 'sem' feature. Define 'test.fcfg' to be the following """ from nltk.grammar import FeatureGrammar g = FeatureGrammar.fromstring(""" % start S S[sem=] -> 'hello' """) print("Reading grammar: %s" % g) print("*" * 20) for reading in interpret_sents(['hello'], g, semkey='sem'): syn, sem = reading[0] print() print("output: ", sem) def demo(): import sys from optparse import OptionParser description = \ """ Parse and evaluate some sentences. """ opts = OptionParser(description=description) opts.set_defaults(evaluate=True, beta=True, syntrace=0, semtrace=0, demo='default', grammar='', sentences='') opts.add_option("-d", "--demo", dest="demo", help="choose demo D; omit this for the default demo, or specify 'chat80'", metavar="D") opts.add_option("-g", "--gram", dest="grammar", help="read in grammar G", metavar="G") opts.add_option("-m", "--model", dest="model", help="import model M (omit '.py' suffix)", metavar="M") opts.add_option("-s", "--sentences", dest="sentences", help="read in a file of test sentences S", metavar="S") opts.add_option("-e", "--no-eval", action="store_false", dest="evaluate", help="just do a syntactic analysis") opts.add_option("-b", "--no-beta-reduction", action="store_false", dest="beta", help="don't carry out beta-reduction") opts.add_option("-t", "--syntrace", action="count", dest="syntrace", help="set syntactic tracing on; requires '-e' option") opts.add_option("-T", "--semtrace", action="count", dest="semtrace", help="set semantic tracing on") (options, args) = opts.parse_args() SPACER = '-' * 30 demo_model0() sents = [ 'Fido sees a boy with Mary', 'John sees Mary', 'every girl chases a dog', 'every boy chases a girl', 'John walks with a girl in Noosa', 'who walks'] gramfile = 'grammars/sample_grammars/sem2.fcfg' if options.sentences: sentsfile = options.sentences if options.grammar: gramfile = options.grammar if options.model: exec("import %s as model" % options.model) if sents is None: sents = read_sents(sentsfile) # Set model and assignment model = m0 g = g0 if options.evaluate: evaluations = \ evaluate_sents(sents, gramfile, model, g, trace=options.semtrace) else: semreps = \ interpret_sents(sents, gramfile, trace=options.syntrace) for i, sent in enumerate(sents): n = 1 print('\nSentence: %s' % sent) print(SPACER) if options.evaluate: for (syntree, semrep, value) in evaluations[i]: if isinstance(value, dict): value = set(value.keys()) print('%d: %s' % (n, semrep)) print(value) n += 1 else: for (syntree, semrep) in semreps[i]: print('%d: %s' % (n, semrep)) n += 1 if __name__ == "__main__": #demo() demo_legacy_grammar() nltk-3.1/nltk/sentiment/0000755000076500000240000000000012610001541015020 5ustar sbstaff00000000000000nltk-3.1/nltk/sentiment/__init__.py0000644000076500000240000000060412607522461017150 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Sentiment Analysis # # Copyright (C) 2001-2015 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT """ NLTK Sentiment Analysis Package """ from nltk.sentiment.sentiment_analyzer import 
SentimentAnalyzer from nltk.sentiment.vader import SentimentIntensityAnalyzer nltk-3.1/nltk/sentiment/sentiment_analyzer.py0000644000076500000240000002303312607224144021322 0ustar sbstaff00000000000000# coding: utf-8 # # Natural Language Toolkit: Sentiment Analyzer # # Copyright (C) 2001-2015 NLTK Project # Author: Pierpaolo Pantone <24alsecondo@gmail.com> # URL: # For license information, see LICENSE.TXT """ A SentimentAnalyzer is a tool to implement and facilitate Sentiment Analysis tasks using NLTK features and classifiers, especially for teaching and demonstrative purposes. """ from __future__ import print_function from collections import defaultdict from nltk.classify.util import apply_features, accuracy as eval_accuracy from nltk.collocations import BigramCollocationFinder from nltk.metrics import (BigramAssocMeasures, precision as eval_precision, recall as eval_recall, f_measure as eval_f_measure) from nltk.probability import FreqDist from nltk.sentiment.util import save_file, timer class SentimentAnalyzer(object): """ A Sentiment Analysis tool based on machine learning approaches. """ def __init__(self, classifier=None): self.feat_extractors = defaultdict(list) self.classifier = classifier def all_words(self, documents, labeled=None): """ Return all words/tokens from the documents (with duplicates). :param documents: a list of (words, label) tuples. :param labeled: if `True`, assume that each document is represented by a (words, label) tuple: (list(str), str). If `False`, each document is considered as being a simple list of strings: list(str). :rtype: list(str) :return: A list of all words/tokens in `documents`. """ all_words = [] if labeled is None: labeled = documents and isinstance(documents[0], tuple) if labeled == True: for words, sentiment in documents: all_words.extend(words) elif labeled == False: for words in documents: all_words.extend(words) return all_words def apply_features(self, documents, labeled=None): """ Apply all feature extractor functions to the documents. This is a wrapper around `nltk.classify.util.apply_features`. If `labeled=False`, return featuresets as: [feature_func(doc) for doc in documents] If `labeled=True`, return featuresets as: [(feature_func(tok), label) for (tok, label) in toks] :param documents: a list of documents. `If labeled=True`, the method expects a list of (words, label) tuples. :rtype: LazyMap """ return apply_features(self.extract_features, documents, labeled) def unigram_word_feats(self, words, top_n=None, min_freq=0): """ Return most common top_n word features. :param words: a list of words/tokens. :param top_n: number of best words/tokens to use, sorted by frequency. :rtype: list(str) :return: A list of `top_n` words/tokens (with no duplicates) sorted by frequency. """ # Stopwords are not removed unigram_feats_freqs = FreqDist(word for word in words) return [w for w, f in unigram_feats_freqs.most_common(top_n) if unigram_feats_freqs[w] > min_freq] def bigram_collocation_feats(self, documents, top_n=None, min_freq=3, assoc_measure=BigramAssocMeasures.pmi): """ Return `top_n` bigram features (using `assoc_measure`). Note that this method is based on bigram collocations measures, and not on simple bigram frequency. :param documents: a list (or iterable) of tokens. :param top_n: number of best words/tokens to use, sorted by association measure. :param assoc_measure: bigram association measure to use as score function. :param min_freq: the minimum number of occurrencies of bigrams to take into consideration. 
:return: `top_n` ngrams scored by the given association measure. """ finder = BigramCollocationFinder.from_documents(documents) finder.apply_freq_filter(min_freq) return finder.nbest(assoc_measure, top_n) def classify(self, instance): """ Classify a single instance applying the features that have already been stored in the SentimentAnalyzer. :param instance: a list (or iterable) of tokens. :return: the classification result given by applying the classifier. """ instance_feats = self.apply_features([instance], labeled=False) return self.classifier.classify(instance_feats[0]) def add_feat_extractor(self, function, **kwargs): """ Add a new function to extract features from a document. This function will be used in extract_features(). Important: in this step our kwargs are only representing additional parameters, and NOT the document we have to parse. The document will always be the first parameter in the parameter list, and it will be added in the extract_features() function. :param function: the extractor function to add to the list of feature extractors. :param kwargs: additional parameters required by the `function` function. """ self.feat_extractors[function].append(kwargs) def extract_features(self, document): """ Apply extractor functions (and their parameters) to the present document. We pass `document` as the first parameter of the extractor functions. If we want to use the same extractor function multiple times, we have to add it to the extractors with `add_feat_extractor` using multiple sets of parameters (one for each call of the extractor function). :param document: the document that will be passed as argument to the feature extractor functions. :return: A dictionary of populated features extracted from the document. :rtype: dict """ all_features = {} for extractor in self.feat_extractors: for param_set in self.feat_extractors[extractor]: feats = extractor(document, **param_set) all_features.update(feats) return all_features def train(self, trainer, training_set, save_classifier=None, **kwargs): """ Train classifier on the training set, optionally saving the output in the file specified by `save_classifier`. Additional arguments depend on the specific trainer used. For example, a MaxentClassifier can use `max_iter` parameter to specify the number of iterations, while a NaiveBayesClassifier cannot. :param trainer: `train` method of a classifier. E.g.: NaiveBayesClassifier.train :param training_set: the training set to be passed as argument to the classifier `train` method. :param save_classifier: the filename of the file where the classifier will be stored (optional). :param kwargs: additional parameters that will be passed as arguments to the classifier `train` function. :return: A classifier instance trained on the training set. """ print("Training classifier") self.classifier = trainer(training_set, **kwargs) if save_classifier: save_file(self.classifier, save_classifier) return self.classifier def evaluate(self, test_set, classifier=None, accuracy=True, f_measure=True, precision=True, recall=True, verbose=False): """ Evaluate and print classifier performance on the test set. :param test_set: A list of (tokens, label) tuples to use as gold set. :param classifier: a classifier instance (previously trained). :param accuracy: if `True`, evaluate classifier accuracy. :param f_measure: if `True`, evaluate classifier f_measure. :param precision: if `True`, evaluate classifier precision. :param recall: if `True`, evaluate classifier recall. :return: evaluation results. 
:rtype: dict """ if classifier is None: classifier = self.classifier print("Evaluating {0} results...".format(type(classifier).__name__)) metrics_results = {} if accuracy == True: accuracy_score = eval_accuracy(classifier, test_set) metrics_results['Accuracy'] = accuracy_score gold_results = defaultdict(set) test_results = defaultdict(set) labels = set() for i, (feats, label) in enumerate(test_set): labels.add(label) gold_results[label].add(i) observed = classifier.classify(feats) test_results[observed].add(i) for label in labels: if precision == True: precision_score = eval_precision(gold_results[label], test_results[label]) metrics_results['Precision [{0}]'.format(label)] = precision_score if recall == True: recall_score = eval_recall(gold_results[label], test_results[label]) metrics_results['Recall [{0}]'.format(label)] = recall_score if f_measure == True: f_measure_score = eval_f_measure(gold_results[label], test_results[label]) metrics_results['F-measure [{0}]'.format(label)] = f_measure_score # Print evaluation results (in alphabetical order) if verbose == True: for result in sorted(metrics_results): print('{0}: {1}'.format(result, metrics_results[result])) return metrics_results nltk-3.1/nltk/sentiment/util.py0000644000076500000240000007442012607224144016372 0ustar sbstaff00000000000000# coding: utf-8 # # Natural Language Toolkit: Sentiment Analyzer # # Copyright (C) 2001-2015 NLTK Project # Author: Pierpaolo Pantone <24alsecondo@gmail.com> # URL: # For license information, see LICENSE.TXT """ Utility methods for Sentiment Analysis. """ from copy import deepcopy import codecs import csv import json import pickle import random import re import sys import time import nltk from nltk.corpus import CategorizedPlaintextCorpusReader from nltk.data import load from nltk.tokenize.casual import EMOTICON_RE from nltk.twitter.common import outf_writer_compat, extract_fields #//////////////////////////////////////////////////////////// #{ Regular expressions #//////////////////////////////////////////////////////////// # Regular expression for negation by Christopher Potts NEGATION = r""" (?: ^(?:never|no|nothing|nowhere|noone|none|not| havent|hasnt|hadnt|cant|couldnt|shouldnt| wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint )$ ) | n't""" NEGATION_RE = re.compile(NEGATION, re.VERBOSE) CLAUSE_PUNCT = r'^[.:;!?]$' CLAUSE_PUNCT_RE = re.compile(CLAUSE_PUNCT) # Happy and sad emoticons HAPPY = set([ ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}', ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D', '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P', 'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)', '<3' ]) SAD = set([ ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<', ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c', ':c', ':{', '>:\\', ';(' ]) def timer(method): """ A timer decorator to measure execution performance of methods. 
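A hedged usage sketch (the decorated function and its body are purely illustrative)::

    @timer
    def train_and_evaluate(analyzer, trainer, training_set, test_set):
        analyzer.train(trainer, training_set)
        return analyzer.evaluate(test_set)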
""" def timed(*args, **kw): start = time.time() result = method(*args, **kw) end = time.time() tot_time = end - start hours = int(tot_time / 3600) mins = int((tot_time / 60) % 60) # in Python 2.x round() will return a float, so we convert it to int secs = int(round(tot_time % 60)) if hours == 0 and mins == 0 and secs < 10: print('[TIMER] {0}(): {:.3f} seconds'.format(method.__name__, tot_time)) else: print('[TIMER] {0}(): {1}h {2}m {3}s'.format(method.__name__, hours, mins, secs)) return result return timed #//////////////////////////////////////////////////////////// #{ Feature extractor functions #//////////////////////////////////////////////////////////// """ Feature extractor functions are declared outside the SentimentAnalyzer class. Users should have the possibility to create their own feature extractors without modifying SentimentAnalyzer. """ def extract_unigram_feats(document, unigrams, handle_negation=False): """ Populate a dictionary of unigram features, reflecting the presence/absence in the document of each of the tokens in `unigrams`. :param document: a list of words/tokens. :param unigrams: a list of words/tokens whose presence/absence has to be checked in `document`. :param handle_negation: if `handle_negation == True` apply `mark_negation` method to `document` before checking for unigram presence/absence. :return: a dictionary of unigram features {unigram : boolean}. >>> words = ['ice', 'police', 'riot'] >>> document = 'ice is melting due to global warming'.split() >>> sorted(extract_unigram_feats(document, words).items()) [('contains(ice)', True), ('contains(police)', False), ('contains(riot)', False)] """ features = {} if handle_negation: document = mark_negation(document) for word in unigrams: features['contains({0})'.format(word)] = word in set(document) return features def extract_bigram_feats(document, bigrams): """ Populate a dictionary of bigram features, reflecting the presence/absence in the document of each of the tokens in `bigrams`. This extractor function only considers contiguous bigrams obtained by `nltk.bigrams`. :param document: a list of words/tokens. :param unigrams: a list of bigrams whose presence/absence has to be checked in `document`. :return: a dictionary of bigram features {bigram : boolean}. >>> bigrams = [('global', 'warming'), ('police', 'prevented'), ('love', 'you')] >>> document = 'ice is melting due to global warming'.split() >>> sorted(extract_bigram_feats(document, bigrams).items()) [('contains(global - warming)', True), ('contains(love - you)', False), ('contains(police - prevented)', False)] """ features = {} for bigr in bigrams: features['contains({0} - {1})'.format(bigr[0], bigr[1])] = bigr in nltk.bigrams(document) return features #//////////////////////////////////////////////////////////// #{ Helper Functions #//////////////////////////////////////////////////////////// def mark_negation(document, double_neg_flip=False, shallow=False): """ Append _NEG suffix to words that appear in the scope between a negation and a punctuation mark. :param document: a list of words/tokens, or a tuple (words, label). :param shallow: if True, the method will modify the original document in place. :param double_neg_flip: if True, double negation is considered affirmation (we activate/deactivate negation scope everytime we find a negation). :return: if `shallow == True` the method will modify the original document and return it. If `shallow == False` the method will return a modified document, leaving the original unmodified. 
>>> sent = "I didn't like this movie . It was bad .".split() >>> mark_negation(sent) ['I', "didn't", 'like_NEG', 'this_NEG', 'movie_NEG', '.', 'It', 'was', 'bad', '.'] """ if not shallow: document = deepcopy(document) # check if the document is labeled. If so, do not consider the label. labeled = document and isinstance(document[0], (tuple, list)) if labeled: doc = document[0] else: doc = document neg_scope = False for i, word in enumerate(doc): if NEGATION_RE.search(word): if not neg_scope or (neg_scope and double_neg_flip): neg_scope = not neg_scope continue else: doc[i] += '_NEG' elif neg_scope and CLAUSE_PUNCT_RE.search(word): neg_scope = not neg_scope elif neg_scope and not CLAUSE_PUNCT_RE.search(word): doc[i] += '_NEG' return document def output_markdown(filename, **kwargs): """ Write the output of an analysis to a file. """ with codecs.open(filename, 'at') as outfile: text = '\n*** \n\n' text += '{0} \n\n'.format(time.strftime("%d/%m/%Y, %H:%M")) for k in sorted(kwargs): if isinstance(kwargs[k], dict): dictionary = kwargs[k] text += ' - **{0}:**\n'.format(k) for entry in sorted(dictionary): text += ' - {0}: {1} \n'.format(entry, dictionary[entry]) elif isinstance(kwargs[k], list): text += ' - **{0}:**\n'.format(k) for entry in kwargs[k]: text += ' - {0}\n'.format(entry) else: text += ' - **{0}:** {1} \n'.format(k, kwargs[k]) outfile.write(text) def save_file(content, filename): """ Store `content` in `filename`. Can be used to store a SentimentAnalyzer. """ print("Saving", filename) with codecs.open(filename, 'wb') as storage_file: # The protocol=2 parameter is for python2 compatibility pickle.dump(content, storage_file, protocol=2) def split_train_test(all_instances, n=None): """ Randomly split `n` instances of the dataset into train and test sets. :param all_instances: a list of instances (e.g. documents) that will be split. :param n: the number of instances to consider (in case we want to use only a subset). :return: two lists of instances. Train set is 8/10 of the total and test set is 2/10 of the total. """ random.seed(12345) random.shuffle(all_instances) if not n or n > len(all_instances): n = len(all_instances) train_set = all_instances[:int(.8*n)] test_set = all_instances[int(.8*n):n] return train_set, test_set def _show_plot(x_values, y_values, x_labels=None, y_labels=None): try: import matplotlib.pyplot as plt except ImportError: raise ImportError('The plot function requires matplotlib to be installed.' 'See http://matplotlib.org/') plt.locator_params(axis='y', nbins=3) axes = plt.axes() axes.yaxis.grid() plt.plot(x_values, y_values, 'ro', color='red') plt.ylim(ymin=-1.2, ymax=1.2) plt.tight_layout(pad=5) if x_labels: plt.xticks(x_values, x_labels, rotation='vertical') if y_labels: plt.yticks([-1, 0, 1], y_labels, rotation='horizontal') # Pad margins so that markers are not clipped by the axes plt.margins(0.2) plt.show() #//////////////////////////////////////////////////////////// #{ Parsing and conversion functions #//////////////////////////////////////////////////////////// def json2csv_preprocess(json_file, outfile, fields, encoding='utf8', errors='replace', gzip_compress=False, skip_retweets=True, skip_tongue_tweets=True, skip_ambiguous_tweets=True, strip_off_emoticons=True, remove_duplicates=True, limit=None): """ Convert json file to csv file, preprocessing each row to obtain a suitable dataset for tweets Semantic Analysis. :param json_file: the original json file containing tweets. :param outfile: the output csv filename. 
:param fields: a list of fields that will be extracted from the json file and kept in the output csv file. :param encoding: the encoding of the files. :param errors: the error handling strategy for the output writer. :param gzip_compress: if True, create a compressed GZIP file. :param skip_retweets: if True, remove retweets. :param skip_tongue_tweets: if True, remove tweets containing ":P" and ":-P" emoticons. :param skip_ambiguous_tweets: if True, remove tweets containing both happy and sad emoticons. :param strip_off_emoticons: if True, strip off emoticons from all tweets. :param remove_duplicates: if True, remove tweets appearing more than once. :param limit: an integer to set the number of tweets to convert. After the limit is reached the conversion will stop. It can be useful to create subsets of the original tweets json data. """ with codecs.open(json_file, encoding=encoding) as fp: (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress) # write the list of fields as header writer.writerow(fields) if remove_duplicates == True: tweets_cache = [] i = 0 for line in fp: tweet = json.loads(line) row = extract_fields(tweet, fields) try: text = row[fields.index('text')] # Remove retweets if skip_retweets == True: if re.search(r'\bRT\b', text): continue # Remove tweets containing ":P" and ":-P" emoticons if skip_tongue_tweets == True: if re.search(r'\:\-?P\b', text): continue # Remove tweets containing both happy and sad emoticons if skip_ambiguous_tweets == True: all_emoticons = EMOTICON_RE.findall(text) if all_emoticons: if (set(all_emoticons) & HAPPY) and (set(all_emoticons) & SAD): continue # Strip off emoticons from all tweets if strip_off_emoticons == True: row[fields.index('text')] = re.sub(r'(?!\n)\s+', ' ', EMOTICON_RE.sub('', text)) # Remove duplicate tweets if remove_duplicates == True: if row[fields.index('text')] in tweets_cache: continue else: tweets_cache.append(row[fields.index('text')]) except ValueError: pass writer.writerow(row) i += 1 if limit and i >= limit: break outf.close() def parse_tweets_set(filename, label, word_tokenizer=None, sent_tokenizer=None, skip_header=True): """ Parse csv file containing tweets and output data a list of (text, label) tuples. :param filename: the input csv filename. :param label: the label to be appended to each tweet contained in the csv file. :param word_tokenizer: the tokenizer instance that will be used to tokenize each sentence into tokens (e.g. WordPunctTokenizer() or BlanklineTokenizer()). If no word_tokenizer is specified, tweets will not be tokenized. :param sent_tokenizer: the tokenizer that will be used to split each tweet into sentences. :param skip_header: if True, skip the first line of the csv file (which usually contains headers). :return: a list of (text, label) tuples. 
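    A typical call, assuming 'positive_tweets.csv' has already been produced by
    json2csv_preprocess above (the filename is only an example):

        from nltk.tokenize import TweetTokenizer
        pos_docs = parse_tweets_set('positive_tweets.csv', label='pos',
                                    word_tokenizer=TweetTokenizer())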
""" tweets = [] if not sent_tokenizer: sent_tokenizer = load('tokenizers/punkt/english.pickle') # If we use Python3.x we can proceed using the 'rt' flag if sys.version_info[0] == 3: with codecs.open(filename, 'rt') as csvfile: reader = csv.reader(csvfile) if skip_header == True: next(reader, None) # skip the header i = 0 for tweet_id, text in reader: # text = text[1] i += 1 sys.stdout.write('Loaded {0} tweets\r'.format(i)) # Apply sentence and word tokenizer to text if word_tokenizer: tweet = [w for sent in sent_tokenizer.tokenize(text) for w in word_tokenizer.tokenize(sent)] else: tweet = text tweets.append((tweet, label)) # If we use Python2.x we need to handle encoding problems elif sys.version_info[0] < 3: with codecs.open(filename) as csvfile: reader = csv.reader(csvfile) if skip_header == True: next(reader, None) # skip the header i = 0 for row in reader: unicode_row = [x.decode('utf8') for x in row] text = unicode_row[1] i += 1 sys.stdout.write('Loaded {0} tweets\r'.format(i)) # Apply sentence and word tokenizer to text if word_tokenizer: tweet = [w.encode('utf8') for sent in sent_tokenizer.tokenize(text) for w in word_tokenizer.tokenize(sent)] else: tweet = text tweets.append((tweet, label)) print("Loaded {0} tweets".format(i)) return tweets #//////////////////////////////////////////////////////////// #{ Demos #//////////////////////////////////////////////////////////// def demo_tweets(trainer, n_instances=None, output=None): """ Train and test Naive Bayes classifier on 10000 tweets, tokenized using TweetTokenizer. Features are composed of: - 1000 most frequent unigrams - 100 top bigrams (using BigramAssocMeasures.pmi) :param trainer: `train` method of a classifier. :param n_instances: the number of total tweets that have to be used for training and testing. Tweets will be equally split between positive and negative. :param output: the output file where results have to be reported. """ from nltk.tokenize import TweetTokenizer from sentiment_analyzer import SentimentAnalyzer from nltk.corpus import twitter_samples, stopwords # Different customizations for the TweetTokenizer tokenizer = TweetTokenizer(preserve_case=False) # tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True) # tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True) if n_instances is not None: n_instances = int(n_instances/2) fields = ['id', 'text'] positive_json = twitter_samples.abspath("positive_tweets.json") positive_csv = 'positive_tweets.csv' json2csv_preprocess(positive_json, positive_csv, fields, limit=n_instances) negative_json = twitter_samples.abspath("negative_tweets.json") negative_csv = 'negative_tweets.csv' json2csv_preprocess(negative_json, negative_csv, fields, limit=n_instances) neg_docs = parse_tweets_set(negative_csv, label='neg', word_tokenizer=tokenizer) pos_docs = parse_tweets_set(positive_csv, label='pos', word_tokenizer=tokenizer) # We separately split subjective and objective instances to keep a balanced # uniform class distribution in both train and test sets. 
train_pos_docs, test_pos_docs = split_train_test(pos_docs) train_neg_docs, test_neg_docs = split_train_test(neg_docs) training_tweets = train_pos_docs+train_neg_docs testing_tweets = test_pos_docs+test_neg_docs sentim_analyzer = SentimentAnalyzer() # stopwords = stopwords.words('english') # all_words = [word for word in sentim_analyzer.all_words(training_tweets) if word.lower() not in stopwords] all_words = [word for word in sentim_analyzer.all_words(training_tweets)] # Add simple unigram word features unigram_feats = sentim_analyzer.unigram_word_feats(all_words, top_n=1000) sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) # Add bigram collocation features bigram_collocs_feats = sentim_analyzer.bigram_collocation_feats([tweet[0] for tweet in training_tweets], top_n=100, min_freq=12) sentim_analyzer.add_feat_extractor(extract_bigram_feats, bigrams=bigram_collocs_feats) training_set = sentim_analyzer.apply_features(training_tweets) test_set = sentim_analyzer.apply_features(testing_tweets) classifier = sentim_analyzer.train(trainer, training_set) # classifier = sentim_analyzer.train(trainer, training_set, max_iter=4) try: classifier.show_most_informative_features() except AttributeError: print('Your classifier does not provide a show_most_informative_features() method.') results = sentim_analyzer.evaluate(test_set) if output: extr = [f.__name__ for f in sentim_analyzer.feat_extractors] output_markdown(output, Dataset='labeled_tweets', Classifier=type(classifier).__name__, Tokenizer=tokenizer.__class__.__name__, Feats=extr, Results=results, Instances=n_instances) def demo_movie_reviews(trainer, n_instances=None, output=None): """ Train classifier on all instances of the Movie Reviews dataset. The corpus has been preprocessed using the default sentence tokenizer and WordPunctTokenizer. Features are composed of: - most frequent unigrams :param trainer: `train` method of a classifier. :param n_instances: the number of total reviews that have to be used for training and testing. Reviews will be equally split between positive and negative. :param output: the output file where results have to be reported. """ from nltk.corpus import movie_reviews from sentiment_analyzer import SentimentAnalyzer if n_instances is not None: n_instances = int(n_instances/2) pos_docs = [(list(movie_reviews.words(pos_id)), 'pos') for pos_id in movie_reviews.fileids('pos')[:n_instances]] neg_docs = [(list(movie_reviews.words(neg_id)), 'neg') for neg_id in movie_reviews.fileids('neg')[:n_instances]] # We separately split positive and negative instances to keep a balanced # uniform class distribution in both train and test sets. 
train_pos_docs, test_pos_docs = split_train_test(pos_docs) train_neg_docs, test_neg_docs = split_train_test(neg_docs) training_docs = train_pos_docs+train_neg_docs testing_docs = test_pos_docs+test_neg_docs sentim_analyzer = SentimentAnalyzer() all_words = sentim_analyzer.all_words(training_docs) # Add simple unigram word features unigram_feats = sentim_analyzer.unigram_word_feats(all_words, min_freq=4) sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) # Apply features to obtain a feature-value representation of our datasets training_set = sentim_analyzer.apply_features(training_docs) test_set = sentim_analyzer.apply_features(testing_docs) classifier = sentim_analyzer.train(trainer, training_set) try: classifier.show_most_informative_features() except AttributeError: print('Your classifier does not provide a show_most_informative_features() method.') results = sentim_analyzer.evaluate(test_set) if output: extr = [f.__name__ for f in sentim_analyzer.feat_extractors] output_markdown(output, Dataset='Movie_reviews', Classifier=type(classifier).__name__, Tokenizer='WordPunctTokenizer', Feats=extr, Results=results, Instances=n_instances) def demo_subjectivity(trainer, save_analyzer=False, n_instances=None, output=None): """ Train and test a classifier on instances of the Subjective Dataset by Pang and Lee. The dataset is made of 5000 subjective and 5000 objective sentences. All tokens (words and punctuation marks) are separated by a whitespace, so we use the basic WhitespaceTokenizer to parse the data. :param trainer: `train` method of a classifier. :param save_analyzer: if `True`, store the SentimentAnalyzer in a pickle file. :param n_instances: the number of total sentences that have to be used for training and testing. Sentences will be equally split between positive and negative. :param output: the output file where results have to be reported. """ from sentiment_analyzer import SentimentAnalyzer from nltk.corpus import subjectivity if n_instances is not None: n_instances = int(n_instances/2) subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]] obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]] # We separately split subjective and objective instances to keep a balanced # uniform class distribution in both train and test sets. 
train_subj_docs, test_subj_docs = split_train_test(subj_docs) train_obj_docs, test_obj_docs = split_train_test(obj_docs) training_docs = train_subj_docs+train_obj_docs testing_docs = test_subj_docs+test_obj_docs sentim_analyzer = SentimentAnalyzer() all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs]) # Add simple unigram word features handling negation unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4) sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) # Apply features to obtain a feature-value representation of our datasets training_set = sentim_analyzer.apply_features(training_docs) test_set = sentim_analyzer.apply_features(testing_docs) classifier = sentim_analyzer.train(trainer, training_set) try: classifier.show_most_informative_features() except AttributeError: print('Your classifier does not provide a show_most_informative_features() method.') results = sentim_analyzer.evaluate(test_set) if save_analyzer == True: save_file(sentim_analyzer, 'sa_subjectivity.pickle') if output: extr = [f.__name__ for f in sentim_analyzer.feat_extractors] output_markdown(output, Dataset='subjectivity', Classifier=type(classifier).__name__, Tokenizer='WhitespaceTokenizer', Feats=extr, Instances=n_instances, Results=results) return sentim_analyzer def demo_sent_subjectivity(text): """ Classify a single sentence as subjective or objective using a stored SentimentAnalyzer. :param text: a sentence whose subjectivity has to be classified. """ from nltk.classify import NaiveBayesClassifier from nltk.tokenize import regexp word_tokenizer = regexp.WhitespaceTokenizer() try: sentim_analyzer = load('sa_subjectivity.pickle') except LookupError: print('Cannot find the sentiment analyzer you want to load.') print('Training a new one using NaiveBayesClassifier.') sentim_analyzer = demo_subjectivity(NaiveBayesClassifier.train, True) # Tokenize and convert to lower case tokenized_text = [word.lower() for word in word_tokenizer.tokenize(text)] print(sentim_analyzer.classify(tokenized_text)) def demo_liu_hu_lexicon(sentence, plot=False): """ Basic example of sentiment classification using Liu and Hu opinion lexicon. This function simply counts the number of positive, negative and neutral words in the sentence and classifies it depending on which polarity is more represented. Words that do not appear in the lexicon are considered as neutral. :param sentence: a sentence whose polarity has to be classified. :param plot: if True, plot a visual representation of the sentence polarity. """ from nltk.corpus import opinion_lexicon from nltk.tokenize import treebank tokenizer = treebank.TreebankWordTokenizer() pos_words = 0 neg_words = 0 tokenized_sent = [word.lower() for word in tokenizer.tokenize(sentence)] x = list(range(len(tokenized_sent))) # x axis for the plot y = [] for word in tokenized_sent: if word in opinion_lexicon.positive(): pos_words += 1 y.append(1) # positive elif word in opinion_lexicon.negative(): neg_words += 1 y.append(-1) # negative else: y.append(0) # neutral if pos_words > neg_words: print('Positive') elif pos_words < neg_words: print('Negative') elif pos_words == neg_words: print('Neutral') if plot == True: _show_plot(x, y, x_labels=tokenized_sent, y_labels=['Negative', 'Neutral', 'Positive']) def demo_vader_instance(text): """ Output polarity scores for a text using Vader approach. :param text: a text whose polarity has to be evaluated. 
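    The printed result is a dictionary with 'neg', 'neu', 'pos' and 'compound'
    entries; 'compound' is the normalized overall score between -1 and 1.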
""" from vader import SentimentIntensityAnalyzer vader_analyzer = SentimentIntensityAnalyzer() print(vader_analyzer.polarity_scores(text)) def demo_vader_tweets(n_instances=None, output=None): """ Classify 10000 positive and negative tweets using Vader approach. :param n_instances: the number of total tweets that have to be classified. :param output: the output file where results have to be reported. """ from collections import defaultdict from nltk.corpus import twitter_samples from vader import SentimentIntensityAnalyzer from nltk.metrics import (accuracy as eval_accuracy, precision as eval_precision, recall as eval_recall, f_measure as eval_f_measure) if n_instances is not None: n_instances = int(n_instances/2) fields = ['id', 'text'] positive_json = twitter_samples.abspath("positive_tweets.json") positive_csv = 'positive_tweets.csv' json2csv_preprocess(positive_json, positive_csv, fields, strip_off_emoticons=False, limit=n_instances) negative_json = twitter_samples.abspath("negative_tweets.json") negative_csv = 'negative_tweets.csv' json2csv_preprocess(negative_json, negative_csv, fields, strip_off_emoticons=False, limit=n_instances) pos_docs = parse_tweets_set(positive_csv, label='pos') neg_docs = parse_tweets_set(negative_csv, label='neg') # We separately split subjective and objective instances to keep a balanced # uniform class distribution in both train and test sets. train_pos_docs, test_pos_docs = split_train_test(pos_docs) train_neg_docs, test_neg_docs = split_train_test(neg_docs) training_tweets = train_pos_docs+train_neg_docs testing_tweets = test_pos_docs+test_neg_docs vader_analyzer = SentimentIntensityAnalyzer() gold_results = defaultdict(set) test_results = defaultdict(set) acc_gold_results = [] acc_test_results = [] labels = set() num = 0 for i, (text, label) in enumerate(testing_tweets): labels.add(label) gold_results[label].add(i) acc_gold_results.append(label) score = vader_analyzer.polarity_scores(text)['compound'] if score > 0: observed = 'pos' else: observed = 'neg' num += 1 acc_test_results.append(observed) test_results[observed].add(i) metrics_results = {} for label in labels: accuracy_score = eval_accuracy(acc_gold_results, acc_test_results) metrics_results['Accuracy'] = accuracy_score precision_score = eval_precision(gold_results[label], test_results[label]) metrics_results['Precision [{0}]'.format(label)] = precision_score recall_score = eval_recall(gold_results[label], test_results[label]) metrics_results['Recall [{0}]'.format(label)] = recall_score f_measure_score = eval_f_measure(gold_results[label], test_results[label]) metrics_results['F-measure [{0}]'.format(label)] = f_measure_score for result in sorted(metrics_results): print('{0}: {1}'.format(result, metrics_results[result])) if output: output_markdown(output, Approach='Vader', Dataset='labeled_tweets', Instances=n_instances, Results=metrics_results) if __name__ == '__main__': from nltk.classify import NaiveBayesClassifier, MaxentClassifier from nltk.classify.scikitlearn import SklearnClassifier from sklearn.svm import LinearSVC naive_bayes = NaiveBayesClassifier.train svm = SklearnClassifier(LinearSVC()).train maxent = MaxentClassifier.train demo_tweets(naive_bayes) # demo_movie_reviews(svm) # demo_subjectivity(svm) # demo_sent_subjectivity("she's an artist , but hasn't picked up a brush in a year . 
") # demo_liu_hu_lexicon("This movie was actually neither that funny, nor super witty.", plot=True) # demo_vader_instance("This movie was actually neither that funny, nor super witty.") # demo_vader_tweets() nltk-3.1/nltk/sentiment/vader.py0000644000076500000240000004273112607522773016527 0ustar sbstaff00000000000000# coding: utf-8 # Natural Language Toolkit: vader # # Copyright (C) 2001-2015 NLTK Project # Author: C.J. Hutto # Ewan Klein (modifications) # Pierpaolo Pantone <24alsecondo@gmail.com> (modifications) # URL: # For license information, see LICENSE.TXT # # Modifications to the original VADER code have been made in order to # integrate it into NLTK. These have involved changes to # ensure Python 3 compatibility, and refactoring to achieve greater modularity. """ If you use the VADER sentiment analysis tools, please cite: Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014. """ import codecs import math import os import re import string ##Constants## # (empirically derived mean sentiment intensity rating increase for booster words) B_INCR = 0.293 B_DECR = -0.293 # (empirically derived mean sentiment intensity rating increase for using # ALLCAPs to emphasize a word) C_INCR = 0.733 N_SCALAR = -0.74 # for removing punctuation REGEX_REMOVE_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation)) PUNC_LIST = [".", "!", "?", ",", ";", ":", "-", "'", "\"", "!!", "!!!", "??", "???", "?!?", "!?!", "?!?!", "!?!?"] NEGATE = \ ["aint", "arent", "cannot", "cant", "couldnt", "darent", "didnt", "doesnt", "ain't", "aren't", "can't", "couldn't", "daren't", "didn't", "doesn't", "dont", "hadnt", "hasnt", "havent", "isnt", "mightnt", "mustnt", "neither", "don't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't", "neednt", "needn't", "never", "none", "nope", "nor", "not", "nothing", "nowhere", "oughtnt", "shant", "shouldnt", "uhuh", "wasnt", "werent", "oughtn't", "shan't", "shouldn't", "uh-uh", "wasn't", "weren't", "without", "wont", "wouldnt", "won't", "wouldn't", "rarely", "seldom", "despite"] # booster/dampener 'intensifiers' or 'degree adverbs' # http://en.wiktionary.org/wiki/Category:English_degree_adverbs BOOSTER_DICT = \ {"absolutely": B_INCR, "amazingly": B_INCR, "awfully": B_INCR, "completely": B_INCR, "considerably": B_INCR, "decidedly": B_INCR, "deeply": B_INCR, "effing": B_INCR, "enormously": B_INCR, "entirely": B_INCR, "especially": B_INCR, "exceptionally": B_INCR, "extremely": B_INCR, "fabulously": B_INCR, "flipping": B_INCR, "flippin": B_INCR, "fricking": B_INCR, "frickin": B_INCR, "frigging": B_INCR, "friggin": B_INCR, "fully": B_INCR, "fucking": B_INCR, "greatly": B_INCR, "hella": B_INCR, "highly": B_INCR, "hugely": B_INCR, "incredibly": B_INCR, "intensely": B_INCR, "majorly": B_INCR, "more": B_INCR, "most": B_INCR, "particularly": B_INCR, "purely": B_INCR, "quite": B_INCR, "really": B_INCR, "remarkably": B_INCR, "so": B_INCR, "substantially": B_INCR, "thoroughly": B_INCR, "totally": B_INCR, "tremendously": B_INCR, "uber": B_INCR, "unbelievably": B_INCR, "unusually": B_INCR, "utterly": B_INCR, "very": B_INCR, "almost": B_DECR, "barely": B_DECR, "hardly": B_DECR, "just enough": B_DECR, "kind of": B_DECR, "kinda": B_DECR, "kindof": B_DECR, "kind-of": B_DECR, "less": B_DECR, "little": B_DECR, "marginally": B_DECR, "occasionally": B_DECR, "partly": B_DECR, "scarcely": B_DECR, "slightly": B_DECR, "somewhat": 
B_DECR, "sort of": B_DECR, "sorta": B_DECR, "sortof": B_DECR, "sort-of": B_DECR} # check for special case idioms using a sentiment-laden keyword known to SAGE SPECIAL_CASE_IDIOMS = {"the shit": 3, "the bomb": 3, "bad ass": 1.5, "yeah right": -2, "cut the mustard": 2, "kiss of death": -1.5, "hand to mouth": -2} ##Static methods## def negated(input_words, include_nt=True): """ Determine if input contains negation words """ neg_words = [] neg_words.extend(NEGATE) for word in neg_words: if word in input_words: return True if include_nt: for word in input_words: if "n't" in word: return True if "least" in input_words: i = input_words.index("least") if i > 0 and input_words[i-1] != "at": return True return False def normalize(score, alpha=15): """ Normalize the score to be between -1 and 1 using an alpha that approximates the max expected value """ norm_score = score/math.sqrt((score*score) + alpha) return norm_score def allcap_differential(words): """ Check whether just some words in the input are ALL CAPS :param list words: The words to inspect :returns: `True` if some but not all items in `words` are ALL CAPS """ is_different = False allcap_words = 0 for word in words: if word.isupper(): allcap_words += 1 cap_differential = len(words) - allcap_words if cap_differential > 0 and cap_differential < len(words): is_different = True return is_different def scalar_inc_dec(word, valence, is_cap_diff): """ Check if the preceding words increase, decrease, or negate/nullify the valence """ scalar = 0.0 word_lower = word.lower() if word_lower in BOOSTER_DICT: scalar = BOOSTER_DICT[word_lower] if valence < 0: scalar *= -1 #check if booster/dampener word is in ALLCAPS (while others aren't) if word.isupper() and is_cap_diff: if valence > 0: scalar += C_INCR else: scalar -= C_INCR return scalar class SentiText(object): """ Identify sentiment-relevant string-level properties of input text. """ def __init__(self, text): if not isinstance(text, str): text = str(text.encode('utf-8')) self.text = text self.words_and_emoticons = self._words_and_emoticons() # doesn't separate words from\ # adjacent punctuation (keeps emoticons & contractions) self.is_cap_diff = allcap_differential(self.words_and_emoticons) def _words_only(self): text_mod = REGEX_REMOVE_PUNCTUATION.sub('', self.text) # removes punctuation (but loses emoticons & contractions) words_only = text_mod.split() # get rid of empty items or single letter "words" like 'a' and 'I' words_only = [word for word in words_only if len(word) > 1] return words_only def _words_and_emoticons(self): wes = self.text.split() # get rid of residual empty items or single letter words wes = [we for we in wes if len(we) > 1] for word in self._words_only(): for punct in PUNC_LIST: pword = punct + word x1 = wes.count(pword) while x1 > 0: i = wes.index(pword) wes.remove(pword) wes.insert(i, word) x1 = wes.count(pword) wordp = word + punct x2 = wes.count(wordp) while x2 > 0: i = wes.index(wordp) wes.remove(wordp) wes.insert(i, word) x2 = wes.count(wordp) return wes class SentimentIntensityAnalyzer(object): """ Give a sentiment intensity score to sentences. 
""" def __init__(self, lexicon_file="vader_lexicon.txt"): self.lexicon_file = os.path.join(os.path.dirname(__file__), lexicon_file) self.lexicon = self.make_lex_dict() def make_lex_dict(self): """ Convert lexicon file to a dictionary """ lex_dict = {} with codecs.open(self.lexicon_file, encoding='utf8') as infile: for line in infile: (word, measure) = line.strip().split('\t')[0:2] lex_dict[word] = float(measure) return lex_dict def polarity_scores(self, text): """ Return a float for sentiment strength based on the input text. Positive values are positive valence, negative value are negative valence. """ sentitext = SentiText(text) #text, words_and_emoticons, is_cap_diff = self.preprocess(text) sentiments = [] words_and_emoticons = sentitext.words_and_emoticons for item in words_and_emoticons: valence = 0 i = words_and_emoticons.index(item) if (i < len(words_and_emoticons) - 1 and item.lower() == "kind" and \ words_and_emoticons[i+1].lower() == "of") or \ item.lower() in BOOSTER_DICT: sentiments.append(valence) continue sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments) sentiments = self._but_check(words_and_emoticons, sentiments) return self.score_valence(sentiments, text) def sentiment_valence(self, valence, sentitext, item, i, sentiments): is_cap_diff = sentitext.is_cap_diff words_and_emoticons = sentitext.words_and_emoticons item_lowercase = item.lower() if item_lowercase in self.lexicon: #get the sentiment valence valence = self.lexicon[item_lowercase] #check if sentiment laden word is in ALL CAPS (while others aren't) if item.isupper() and is_cap_diff: if valence > 0: valence += C_INCR else: valence -= C_INCR for start_i in range(0,3): if i > start_i and words_and_emoticons[i-(start_i+1)].lower() not in self.lexicon: # dampen the scalar modifier of preceding words and emoticons # (excluding the ones that immediately preceed the item) based # on their distance from the current item. 
s = scalar_inc_dec(words_and_emoticons[i-(start_i+1)], valence, is_cap_diff) if start_i == 1 and s != 0: s = s*0.95 if start_i == 2 and s != 0: s = s*0.9 valence = valence+s valence = self._never_check(valence, words_and_emoticons, start_i, i) if start_i == 2: valence = self._idioms_check(valence, words_and_emoticons, i) # future work: consider other sentiment-laden idioms # other_idioms = # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2, # "upper hand": 1, "break a leg": 2, # "cooking with gas": 2, "in the black": 2, "in the red": -2, # "on the ball": 2,"under the weather": -2} valence = self._least_check(valence, words_and_emoticons, i) sentiments.append(valence) return sentiments def _least_check(self, valence, words_and_emoticons, i): # check for negation case using "least" if i > 1 and words_and_emoticons[i-1].lower() not in self.lexicon \ and words_and_emoticons[i-1].lower() == "least": if words_and_emoticons[i-2].lower() != "at" and words_and_emoticons[i-2].lower() != "very": valence = valence*N_SCALAR elif i > 0 and words_and_emoticons[i-1].lower() not in self.lexicon \ and words_and_emoticons[i-1].lower() == "least": valence = valence*N_SCALAR return valence def _but_check(self, words_and_emoticons, sentiments): # check for modification in sentiment due to contrastive conjunction 'but' if 'but' in words_and_emoticons or 'BUT' in words_and_emoticons: try: bi = words_and_emoticons.index('but') except ValueError: bi = words_and_emoticons.index('BUT') for sentiment in sentiments: si = sentiments.index(sentiment) if si < bi: sentiments.pop(si) sentiments.insert(si, sentiment*0.5) elif si > bi: sentiments.pop(si) sentiments.insert(si, sentiment*1.5) return sentiments def _idioms_check(self, valence, words_and_emoticons, i): onezero = "{0} {1}".format(words_and_emoticons[i-1], words_and_emoticons[i]) twoonezero = "{0} {1} {2}".format(words_and_emoticons[i-2], words_and_emoticons[i-1], words_and_emoticons[i]) twoone = "{0} {1}".format(words_and_emoticons[i-2], words_and_emoticons[i-1]) threetwoone = "{0} {1} {2}".format(words_and_emoticons[i-3], words_and_emoticons[i-2], words_and_emoticons[i-1]) threetwo = "{0} {1}".format(words_and_emoticons[i-3], words_and_emoticons[i-2]) sequences = [onezero, twoonezero, twoone, threetwoone, threetwo] for seq in sequences: if seq in SPECIAL_CASE_IDIOMS: valence = SPECIAL_CASE_IDIOMS[seq] break if len(words_and_emoticons)-1 > i: zeroone = "{0} {1}".format(words_and_emoticons[i], words_and_emoticons[i+1]) if zeroone in SPECIAL_CASE_IDIOMS: valence = SPECIAL_CASE_IDIOMS[zeroone] if len(words_and_emoticons)-1 > i+1: zeroonetwo = "{0} {1} {2}".format(words_and_emoticons[i], words_and_emoticons[i+1], words_and_emoticons[i+2]) if zeroonetwo in SPECIAL_CASE_IDIOMS: valence = SPECIAL_CASE_IDIOMS[zeroonetwo] # check for booster/dampener bi-grams such as 'sort of' or 'kind of' if threetwo in BOOSTER_DICT or twoone in BOOSTER_DICT: valence = valence+B_DECR return valence def _never_check(self, valence, words_and_emoticons, start_i, i): if start_i == 0: if negated([words_and_emoticons[i-1]]): valence = valence*N_SCALAR if start_i == 1: if words_and_emoticons[i-2] == "never" and\ (words_and_emoticons[i-1] == "so" or words_and_emoticons[i-1] == "this"): valence = valence*1.5 elif negated([words_and_emoticons[i-(start_i+1)]]): valence = valence*N_SCALAR if start_i == 2: if words_and_emoticons[i-3] == "never" and \ (words_and_emoticons[i-2] == "so" or words_and_emoticons[i-2] == "this") or \ (words_and_emoticons[i-1] == "so" or words_and_emoticons[i-1] == 
"this"): valence = valence*1.25 elif negated([words_and_emoticons[i-(start_i+1)]]): valence = valence*N_SCALAR return valence def _punctuation_emphasis(self, sum_s, text): # add emphasis from exclamation points and question marks ep_amplifier = self._amplify_ep(text) qm_amplifier = self._amplify_qm(text) punct_emph_amplifier = ep_amplifier+qm_amplifier return punct_emph_amplifier def _amplify_ep(self, text): # check for added emphasis resulting from exclamation points (up to 4 of them) ep_count = text.count("!") if ep_count > 4: ep_count = 4 # (empirically derived mean sentiment intensity rating increase for # exclamation points) ep_amplifier = ep_count*0.292 return ep_amplifier def _amplify_qm(self, text): # check for added emphasis resulting from question marks (2 or 3+) qm_count = text.count("?") qm_amplifier = 0 if qm_count > 1: if qm_count <= 3: # (empirically derived mean sentiment intensity rating increase for # question marks) qm_amplifier = qm_count*0.18 else: qm_amplifier = 0.96 return qm_amplifier def _sift_sentiment_scores(self, sentiments): # want separate positive versus negative sentiment scores pos_sum = 0.0 neg_sum = 0.0 neu_count = 0 for sentiment_score in sentiments: if sentiment_score > 0: pos_sum += (float(sentiment_score) +1) # compensates for neutral words that are counted as 1 if sentiment_score < 0: neg_sum += (float(sentiment_score) -1) # when used with math.fabs(), compensates for neutrals if sentiment_score == 0: neu_count += 1 return pos_sum, neg_sum, neu_count def score_valence(self, sentiments, text): if sentiments: sum_s = float(sum(sentiments)) # compute and add emphasis from punctuation in text punct_emph_amplifier = self._punctuation_emphasis(sum_s, text) if sum_s > 0: sum_s += punct_emph_amplifier elif sum_s < 0: sum_s -= punct_emph_amplifier compound = normalize(sum_s) # discriminate between positive, negative and neutral sentiment scores pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments) if pos_sum > math.fabs(neg_sum): pos_sum += (punct_emph_amplifier) elif pos_sum < math.fabs(neg_sum): neg_sum -= (punct_emph_amplifier) total = pos_sum + math.fabs(neg_sum) + neu_count pos = math.fabs(pos_sum / total) neg = math.fabs(neg_sum / total) neu = math.fabs(neu_count / total) else: compound = 0.0 pos = 0.0 neg = 0.0 neu = 0.0 sentiment_dict = \ {"neg" : round(neg, 3), "neu" : round(neu, 3), "pos" : round(pos, 3), "compound" : round(compound, 4)} return sentiment_dict nltk-3.1/nltk/stem/0000755000076500000240000000000012610001541013762 5ustar sbstaff00000000000000nltk-3.1/nltk/stem/__init__.py0000644000076500000240000000217112607224144016110 0ustar sbstaff00000000000000# Natural Language Toolkit: Stemmers # # Copyright (C) 2001-2015 NLTK Project # Author: Trevor Cohn # Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT """ NLTK Stemmers Interfaces used to remove morphological affixes from words, leaving only the word stem. Stemming algorithms aim to remove those affixes required for eg. grammatical role, tense, derivational morphology leaving only the stem of the word. This is a difficult problem due to irregular words (eg. common verbs in English), complicated morphological rules, and part-of-speech and sense ambiguities (eg. ``ceil-`` is not the stem of ``ceiling``). StemmerI defines a standard interface for stemmers. 
""" from nltk.stem.api import StemmerI from nltk.stem.regexp import RegexpStemmer from nltk.stem.lancaster import LancasterStemmer from nltk.stem.isri import ISRIStemmer from nltk.stem.porter import PorterStemmer from nltk.stem.snowball import SnowballStemmer from nltk.stem.wordnet import WordNetLemmatizer from nltk.stem.rslp import RSLPStemmer nltk-3.1/nltk/stem/api.py0000644000076500000240000000124112607224144015117 0ustar sbstaff00000000000000# Natural Language Toolkit: Stemmer Interface # # Copyright (C) 2001-2015 NLTK Project # Author: Trevor Cohn # Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT class StemmerI(object): """ A processing interface for removing morphological affixes from words. This process is known as stemming. """ def stem(self, token): """ Strip affixes from the token and return the stem. :param token: The token that should be stemmed. :type token: str """ raise NotImplementedError() nltk-3.1/nltk/stem/isri.py0000644000076500000240000003530012607224144015317 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # # Natural Language Toolkit: The ISRI Arabic Stemmer # # Copyright (C) 2001-2015 NLTK Proejct # Algorithm: Kazem Taghva, Rania Elkhoury, and Jeffrey Coombs (2005) # Author: Hosam Algasaier # URL: # For license information, see LICENSE.TXT """ ISRI Arabic Stemmer The algorithm for this stemmer is described in: Taghva, K., Elkoury, R., and Coombs, J. 2005. Arabic Stemming without a root dictionary. Information Science Research Institute. University of Nevada, Las Vegas, USA. The Information Science Research Institute’s (ISRI) Arabic stemmer shares many features with the Khoja stemmer. However, the main difference is that ISRI stemmer does not use root dictionary. Also, if a root is not found, ISRI stemmer returned normalized form, rather than returning the original unmodified word. Additional adjustments were made to improve the algorithm: 1- Adding 60 stop words. 2- Adding the pattern (ØªÙØ§Ø¹ÙŠÙ„) to ISRI pattern set. 3- The step 2 in the original algorithm was normalizing all hamza. This step is discarded because it increases the word ambiguities and changes the original root. """ from __future__ import unicode_literals import re from nltk.stem.api import StemmerI class ISRIStemmer(StemmerI): ''' ISRI Arabic stemmer based on algorithm: Arabic Stemming without a root dictionary. Information Science Research Institute. University of Nevada, Las Vegas, USA. A few minor modifications have been made to ISRI basic algorithm. See the source code of this module for more information. isri.stem(token) returns Arabic root for the given token. The ISRI Stemmer requires that all tokens have Unicode string types. If you use Python IDLE on Arabic Windows you have to decode text first using Arabic '1256' coding. 
''' def __init__(self): # length three prefixes self.p3 = ['\u0643\u0627\u0644', '\u0628\u0627\u0644', '\u0648\u0644\u0644', '\u0648\u0627\u0644'] # length two prefixes self.p2 = ['\u0627\u0644', '\u0644\u0644'] # length one prefixes self.p1 = ['\u0644', '\u0628', '\u0641', '\u0633', '\u0648', '\u064a', '\u062a', '\u0646', '\u0627'] # length three suffixes self.s3 = ['\u062a\u0645\u0644', '\u0647\u0645\u0644', '\u062a\u0627\u0646', '\u062a\u064a\u0646', '\u0643\u0645\u0644'] # length two suffixes self.s2 = ['\u0648\u0646', '\u0627\u062a', '\u0627\u0646', '\u064a\u0646', '\u062a\u0646', '\u0643\u0645', '\u0647\u0646', '\u0646\u0627', '\u064a\u0627', '\u0647\u0627', '\u062a\u0645', '\u0643\u0646', '\u0646\u064a', '\u0648\u0627', '\u0645\u0627', '\u0647\u0645'] # length one suffixes self.s1 = ['\u0629', '\u0647', '\u064a', '\u0643', '\u062a', '\u0627', '\u0646'] # groups of length four patterns self.pr4 = {0: ['\u0645'], 1: ['\u0627'], 2: ['\u0627', '\u0648', '\u064A'], 3: ['\u0629']} # Groups of length five patterns and length three roots self.pr53 = {0: ['\u0627', '\u062a'], 1: ['\u0627', '\u064a', '\u0648'], 2: ['\u0627', '\u062a', '\u0645'], 3: ['\u0645', '\u064a', '\u062a'], 4: ['\u0645', '\u062a'], 5: ['\u0627', '\u0648'], 6: ['\u0627', '\u0645']} self.re_short_vowels = re.compile(r'[\u064B-\u0652]') self.re_hamza = re.compile(r'[\u0621\u0624\u0626]') self.re_initial_hamza = re.compile(r'^[\u0622\u0623\u0625]') self.stop_words = ['\u064a\u0643\u0648\u0646', '\u0648\u0644\u064a\u0633', '\u0648\u0643\u0627\u0646', '\u0643\u0630\u0644\u0643', '\u0627\u0644\u062a\u064a', '\u0648\u0628\u064a\u0646', '\u0639\u0644\u064a\u0647\u0627', '\u0645\u0633\u0627\u0621', '\u0627\u0644\u0630\u064a', '\u0648\u0643\u0627\u0646\u062a', '\u0648\u0644\u0643\u0646', '\u0648\u0627\u0644\u062a\u064a', '\u062a\u0643\u0648\u0646', '\u0627\u0644\u064a\u0648\u0645', '\u0627\u0644\u0644\u0630\u064a\u0646', '\u0639\u0644\u064a\u0647', '\u0643\u0627\u0646\u062a', '\u0644\u0630\u0644\u0643', '\u0623\u0645\u0627\u0645', '\u0647\u0646\u0627\u0643', '\u0645\u0646\u0647\u0627', '\u0645\u0627\u0632\u0627\u0644', '\u0644\u0627\u0632\u0627\u0644', '\u0644\u0627\u064a\u0632\u0627\u0644', '\u0645\u0627\u064a\u0632\u0627\u0644', '\u0627\u0635\u0628\u062d', '\u0623\u0635\u0628\u062d', '\u0623\u0645\u0633\u0649', '\u0627\u0645\u0633\u0649', '\u0623\u0636\u062d\u0649', '\u0627\u0636\u062d\u0649', '\u0645\u0627\u0628\u0631\u062d', '\u0645\u0627\u0641\u062a\u0626', '\u0645\u0627\u0627\u0646\u0641\u0643', '\u0644\u0627\u0633\u064a\u0645\u0627', '\u0648\u0644\u0627\u064a\u0632\u0627\u0644', '\u0627\u0644\u062d\u0627\u0644\u064a', '\u0627\u0644\u064a\u0647\u0627', '\u0627\u0644\u0630\u064a\u0646', '\u0641\u0627\u0646\u0647', '\u0648\u0627\u0644\u0630\u064a', '\u0648\u0647\u0630\u0627', '\u0644\u0647\u0630\u0627', '\u0641\u0643\u0627\u0646', '\u0633\u062a\u0643\u0648\u0646', '\u0627\u0644\u064a\u0647', '\u064a\u0645\u0643\u0646', '\u0628\u0647\u0630\u0627', '\u0627\u0644\u0630\u0649'] def stem(self, token): """ Stemming a word token using the ISRI stemmer. 
""" token = self.norm(token, 1) # remove diacritics which representing Arabic short vowels if token in self.stop_words: return token # exclude stop words from being processed token = self.pre32(token) # remove length three and length two prefixes in this order token = self.suf32(token) # remove length three and length two suffixes in this order token = self.waw(token) # remove connective ‘و’ if it precedes a word beginning with ‘و’ token = self.norm(token, 2) # normalize initial hamza to bare alif # if 4 <= word length <= 7, then stem; otherwise, no stemming if len(token) == 4: # length 4 word token = self.pro_w4(token) elif len(token) == 5: # length 5 word token = self.pro_w53(token) token = self.end_w5(token) elif len(token) == 6: # length 6 word token = self.pro_w6(token) token = self.end_w6(token) elif len(token) == 7: # length 7 word token = self.suf1(token) if len(token) == 7: token = self.pre1(token) if len(token) == 6: token = self.pro_w6(token) token = self.end_w6(token) return token def norm(self, word, num=3): """ normalization: num=1 normalize diacritics num=2 normalize initial hamza num=3 both 1&2 """ if num == 1: word = self.re_short_vowels.sub('', word) elif num == 2: word = self.re_initial_hamza.sub('\u0627', word) elif num == 3: word = self.re_short_vowels.sub('', word) word = self.re_initial_hamza.sub('\u0627', word) return word def pre32(self, word): """remove length three and length two prefixes in this order""" if len(word) >= 6: for pre3 in self.p3: if word.startswith(pre3): return word[3:] if len(word) >= 5: for pre2 in self.p2: if word.startswith(pre2): return word[2:] return word def suf32(self, word): """remove length three and length two suffixes in this order""" if len(word) >= 6: for suf3 in self.s3: if word.endswith(suf3): return word[:-3] if len(word) >= 5: for suf2 in self.s2: if word.endswith(suf2): return word[:-2] return word def waw(self, word): """remove connective ‘و’ if it precedes a word beginning with ‘و’ """ if len(word) >= 4 and word[:2] == '\u0648\u0648': word = word[1:] return word def pro_w4(self, word): """process length four patterns and extract length three roots""" if word[0] in self.pr4[0]: # Ù…ÙØ¹Ù„ word = word[1:] elif word[1] in self.pr4[1]: # ÙØ§Ø¹Ù„ word = word[:1] + word[2:] elif word[2] in self.pr4[2]: # ÙØ¹Ø§Ù„ - ÙØ¹ÙˆÙ„ - ÙØ¹ÙŠÙ„ word = word[:2] + word[3] elif word[3] in self.pr4[3]: # ÙØ¹Ù„Ø© word = word[:-1] else: word = self.suf1(word) # do - normalize short sufix if len(word) == 4: word = self.pre1(word) # do - normalize short prefix return word def pro_w53(self, word): """process length five patterns and extract length three roots""" if word[2] in self.pr53[0] and word[0] == '\u0627': # Ø§ÙØªØ¹Ù„ - Ø§ÙØ§Ø¹Ù„ word = word[1] + word[3:] elif word[3] in self.pr53[1] and word[0] == '\u0645': # Ù…ÙØ¹ÙˆÙ„ - Ù…ÙØ¹Ø§Ù„ - Ù…ÙØ¹ÙŠÙ„ word = word[1:3] + word[4] elif word[0] in self.pr53[2] and word[4] == '\u0629': # Ù…ÙØ¹Ù„Ø© - ØªÙØ¹Ù„Ø© - Ø§ÙØ¹Ù„Ø© word = word[1:4] elif word[0] in self.pr53[3] and word[2] == '\u062a': # Ù…ÙØªØ¹Ù„ - ÙŠÙØªØ¹Ù„ - ØªÙØªØ¹Ù„ word = word[1] + word[3:] elif word[0] in self.pr53[4] and word[2] == '\u0627': # Ù…ÙØ§Ø¹Ù„ - ØªÙØ§Ø¹Ù„ word = word[1] + word[3:] elif word[2] in self.pr53[5] and word[4] == '\u0629': # ÙØ¹ÙˆÙ„Ø© - ÙØ¹Ø§Ù„Ø© word = word[:2] + word[3] elif word[0] in self.pr53[6] and word[1] == '\u0646': # Ø§Ù†ÙØ¹Ù„ - Ù…Ù†ÙØ¹Ù„ word = word[2:] elif word[3] == '\u0627' and word[0] == '\u0627': # Ø§ÙØ¹Ø§Ù„ word = word[1:3] + word[4] elif word[4] == '\u0646' and word[3] == '\u0627': # ÙØ¹Ù„ان word = word[:3] 
elif word[3] == '\u064a' and word[0] == '\u062a': # ØªÙØ¹ÙŠÙ„ word = word[1:3] + word[4] elif word[3] == '\u0648' and word[1] == '\u0627': # ÙØ§Ø¹ÙˆÙ„ word = word[0] + word[2] + word[4] elif word[2] == '\u0627' and word[1] == '\u0648': # Ùواعل word = word[0] + word[3:] elif word[3] == '\u0626' and word[2] == '\u0627': # ÙØ¹Ø§Ø¦Ù„ word = word[:2] + word[4] elif word[4] == '\u0629' and word[1] == '\u0627': # ÙØ§Ø¹Ù„Ø© word = word[0] + word[2:4] elif word[4] == '\u064a' and word[2] == '\u0627': # ÙØ¹Ø§Ù„ÙŠ word = word[:2] + word[3] else: word = self.suf1(word) # do - normalize short sufix if len(word) == 5: word = self.pre1(word) # do - normalize short prefix return word def pro_w54(self, word): """process length five patterns and extract length four roots""" if word[0] in self.pr53[2]: # ØªÙØ¹Ù„Ù„ - Ø§ÙØ¹Ù„Ù„ - Ù…ÙØ¹Ù„Ù„ word = word[1:] elif word[4] == '\u0629': # ÙØ¹Ù„لة word = word[:4] elif word[2] == '\u0627': # ÙØ¹Ø§Ù„Ù„ word = word[:2] + word[3:] return word def end_w5(self, word): """ending step (word of length five)""" if len(word) == 4: word = self.pro_w4(word) elif len(word) == 5: word = self.pro_w54(word) return word def pro_w6(self, word): """process length six patterns and extract length three roots""" if word.startswith('\u0627\u0633\u062a') or word.startswith('\u0645\u0633\u062a'): # Ù…Ø³ØªÙØ¹Ù„ - Ø§Ø³ØªÙØ¹Ù„ word = word[3:] elif word[0] == '\u0645' and word[3] == '\u0627' and word[5] == '\u0629': # Ù…ÙØ¹Ø§Ù„Ø© word = word[1:3] + word[4] elif word[0] == '\u0627' and word[2] == '\u062a' and word[4] == '\u0627': # Ø§ÙØªØ¹Ø§Ù„ word = word[1] + word[3] + word[5] elif word[0] == '\u0627' and word[3] == '\u0648' and word[2] == word[4]: # Ø§ÙØ¹ÙˆØ¹Ù„ word = word[1] + word[4:] elif word[0] == '\u062a' and word[2] == '\u0627' and word[4] == '\u064a': # ØªÙØ§Ø¹ÙŠÙ„ new pattern word = word[1] + word[3] + word[5] else: word = self.suf1(word) # do - normalize short sufix if len(word) == 6: word = self.pre1(word) # do - normalize short prefix return word def pro_w64(self, word): """process length six patterns and extract length four roots""" if word[0] == '\u0627' and word[4] == '\u0627': # Ø§ÙØ¹Ù„ال word = word[1:4] + word[5] elif word.startswith('\u0645\u062a'): # Ù…ØªÙØ¹Ù„Ù„ word = word[2:] return word def end_w6(self, word): """ending step (word of length six)""" if len(word) == 5: word = self.pro_w53(word) word = self.end_w5(word) elif len(word) == 6: word = self.pro_w64(word) return word def suf1(self, word): """normalize short sufix""" for sf1 in self.s1: if word.endswith(sf1): return word[:-1] return word def pre1(self, word): """normalize short prefix""" for sp1 in self.p1: if word.startswith(sp1): return word[1:] return word nltk-3.1/nltk/stem/lancaster.py0000644000076500000240000002574212607224144016336 0ustar sbstaff00000000000000# Natural Language Toolkit: Stemmers # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Tomcavage # URL: # For license information, see LICENSE.TXT """ A word stemmer based on the Lancaster stemming algorithm. Paice, Chris D. "Another Stemmer." ACM SIGIR Forum 24.3 (1990): 56-61. 
""" from __future__ import unicode_literals import re from nltk.stem.api import StemmerI from nltk.compat import python_2_unicode_compatible @python_2_unicode_compatible class LancasterStemmer(StemmerI): """ Lancaster Stemmer >>> from nltk.stem.lancaster import LancasterStemmer >>> st = LancasterStemmer() >>> st.stem('maximum') # Remove "-um" when word is intact 'maxim' >>> st.stem('presumably') # Don't remove "-um" when word is not intact 'presum' >>> st.stem('multiply') # No action taken if word ends with "-ply" 'multiply' >>> st.stem('provision') # Replace "-sion" with "-j" to trigger "j" set of rules 'provid' >>> st.stem('owed') # Word starting with vowel must contain at least 2 letters 'ow' >>> st.stem('ear') # ditto 'ear' >>> st.stem('saying') # Words starting with consonant must contain at least 3 'say' >>> st.stem('crying') # letters and one of those letters must be a vowel 'cry' >>> st.stem('string') # ditto 'string' >>> st.stem('meant') # ditto 'meant' >>> st.stem('cement') # ditto 'cem' """ # The rule list is static since it doesn't change between instances rule_tuple = ( "ai*2.", # -ia > - if intact "a*1.", # -a > - if intact "bb1.", # -bb > -b "city3s.", # -ytic > -ys "ci2>", # -ic > - "cn1t>", # -nc > -nt "dd1.", # -dd > -d "dei3y>", # -ied > -y "deec2ss.", # -ceed >", -cess "dee1.", # -eed > -ee "de2>", # -ed > - "dooh4>", # -hood > - "e1>", # -e > - "feil1v.", # -lief > -liev "fi2>", # -if > - "gni3>", # -ing > - "gai3y.", # -iag > -y "ga2>", # -ag > - "gg1.", # -gg > -g "ht*2.", # -th > - if intact "hsiug5ct.", # -guish > -ct "hsi3>", # -ish > - "i*1.", # -i > - if intact "i1y>", # -i > -y "ji1d.", # -ij > -id -- see nois4j> & vis3j> "juf1s.", # -fuj > -fus "ju1d.", # -uj > -ud "jo1d.", # -oj > -od "jeh1r.", # -hej > -her "jrev1t.", # -verj > -vert "jsim2t.", # -misj > -mit "jn1d.", # -nj > -nd "j1s.", # -j > -s "lbaifi6.", # -ifiabl > - "lbai4y.", # -iabl > -y "lba3>", # -abl > - "lbi3.", # -ibl > - "lib2l>", # -bil > -bl "lc1.", # -cl > c "lufi4y.", # -iful > -y "luf3>", # -ful > - "lu2.", # -ul > - "lai3>", # -ial > - "lau3>", # -ual > - "la2>", # -al > - "ll1.", # -ll > -l "mui3.", # -ium > - "mu*2.", # -um > - if intact "msi3>", # -ism > - "mm1.", # -mm > -m "nois4j>", # -sion > -j "noix4ct.", # -xion > -ct "noi3>", # -ion > - "nai3>", # -ian > - "na2>", # -an > - "nee0.", # protect -een "ne2>", # -en > - "nn1.", # -nn > -n "pihs4>", # -ship > - "pp1.", # -pp > -p "re2>", # -er > - "rae0.", # protect -ear "ra2.", # -ar > - "ro2>", # -or > - "ru2>", # -ur > - "rr1.", # -rr > -r "rt1>", # -tr > -t "rei3y>", # -ier > -y "sei3y>", # -ies > -y "sis2.", # -sis > -s "si2>", # -is > - "ssen4>", # -ness > - "ss0.", # protect -ss "suo3>", # -ous > - "su*2.", # -us > - if intact "s*1>", # -s > - if intact "s0.", # -s > -s "tacilp4y.", # -plicat > -ply "ta2>", # -at > - "tnem4>", # -ment > - "tne3>", # -ent > - "tna3>", # -ant > - "tpir2b.", # -ript > -rib "tpro2b.", # -orpt > -orb "tcud1.", # -duct > -duc "tpmus2.", # -sumpt > -sum "tpec2iv.", # -cept > -ceiv "tulo2v.", # -olut > -olv "tsis0.", # protect -sist "tsi3>", # -ist > - "tt1.", # -tt > -t "uqi3.", # -iqu > - "ugo1.", # -ogu > -og "vis3j>", # -siv > -j "vie0.", # protect -eiv "vi2>", # -iv > - "ylb1>", # -bly > -bl "yli3y>", # -ily > -y "ylp0.", # protect -ply "yl2>", # -ly > - "ygo1.", # -ogy > -og "yhp1.", # -phy > -ph "ymo1.", # -omy > -om "ypo1.", # -opy > -op "yti3>", # -ity > - "yte3>", # -ety > - "ytl2.", # -lty > -l "yrtsi5.", # -istry > - "yra3>", # -ary > - "yro3>", # -ory > - "yfi3.", # -ify > - "ycn2t>", # 
-ncy > -nt "yca3>", # -acy > - "zi2>", # -iz > - "zy1s." # -yz > -ys ) def __init__(self): """Create an instance of the Lancaster stemmer. """ # Setup an empty rule dictionary - this will be filled in later self.rule_dictionary = {} def parseRules(self, rule_tuple): """Validate the set of rules used in this stemmer. """ valid_rule = re.compile("^[a-z]+\*?\d[a-z]*[>\.]?$") # Empty any old rules from the rule set before adding new ones self.rule_dictionary = {} for rule in rule_tuple: if not valid_rule.match(rule): raise ValueError("The rule %s is invalid" % rule) first_letter = rule[0:1] if first_letter in self.rule_dictionary: self.rule_dictionary[first_letter].append(rule) else: self.rule_dictionary[first_letter] = [rule] def stem(self, word): """Stem a word using the Lancaster stemmer. """ # Lower-case the word, since all the rules are lower-cased word = word.lower() # Save a copy of the original word intact_word = word # If the user hasn't supplied any rules, setup the default rules if len(self.rule_dictionary) == 0: self.parseRules(LancasterStemmer.rule_tuple) return self.__doStemming(word, intact_word) def __doStemming(self, word, intact_word): """Perform the actual word stemming """ valid_rule = re.compile("^([a-z]+)(\*?)(\d)([a-z]*)([>\.]?)$") proceed = True while proceed: # Find the position of the last letter of the word to be stemmed last_letter_position = self.__getLastLetter(word) # Only stem the word if it has a last letter and a rule matching that last letter if last_letter_position < 0 or word[last_letter_position] not in self.rule_dictionary: proceed = False else: rule_was_applied = False # Go through each rule that matches the word's final letter for rule in self.rule_dictionary[word[last_letter_position]]: rule_match = valid_rule.match(rule) if rule_match: (ending_string, intact_flag, remove_total, append_string, cont_flag) = rule_match.groups() # Convert the number of chars to remove when stemming # from a string to an integer remove_total = int(remove_total) # Proceed if word's ending matches rule's word ending if word.endswith(ending_string[::-1]): if intact_flag: if (word == intact_word and self.__isAcceptable(word, remove_total)): word = self.__applyRule(word, remove_total, append_string) rule_was_applied = True if cont_flag == '.': proceed = False break elif self.__isAcceptable(word, remove_total): word = self.__applyRule(word, remove_total, append_string) rule_was_applied = True if cont_flag == '.': proceed = False break # If no rules apply, the word doesn't need any more stemming if rule_was_applied == False: proceed = False return word def __getLastLetter(self, word): """Get the zero-based index of the last alphabetic character in this string """ last_letter = -1 for position in range(len(word)): if word[position].isalpha(): last_letter = position else: break return last_letter def __isAcceptable(self, word, remove_total): """Determine if the word is acceptable for stemming. 
""" word_is_acceptable = False # If the word starts with a vowel, it must be at least 2 # characters long to be stemmed if word[0] in "aeiouy": if (len(word) - remove_total >= 2): word_is_acceptable = True # If the word starts with a consonant, it must be at least 3 # characters long (including one vowel) to be stemmed elif (len(word) - remove_total >= 3): if word[1] in "aeiouy": word_is_acceptable = True elif word[2] in "aeiouy": word_is_acceptable = True return word_is_acceptable def __applyRule(self, word, remove_total, append_string): """Apply the stemming rule to the word """ # Remove letters from the end of the word new_word_length = len(word) - remove_total word = word[0:new_word_length] # And add new letters to the end of the truncated word if append_string: word += append_string return word def __repr__(self): return '' nltk-3.1/nltk/stem/porter.py0000644000076500000240000005725212607224144015676 0ustar sbstaff00000000000000# Copyright (c) 2002 Vivake Gupta (vivakeATomniscia.org). All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License as # published by the Free Software Foundation; either version 2 of the # License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 # USA # # This software is maintained by Vivake (vivakeATomniscia.org) and is available at: # http://www.omniscia.org/~vivake/python/PorterStemmer.py # # Additional modifications were made to incorporate this module into # NLTK. All such modifications are marked with "--NLTK--". The NLTK # version of this module is maintained by NLTK developers, # and is available via http://nltk.org/ # # GNU Linking Exception: # Using this module statically or dynamically with other modules is # making a combined work based on this module. Thus, the terms and # conditions of the GNU General Public License cover the whole combination. # As a special exception, the copyright holders of this module give # you permission to combine this module with independent modules to # produce an executable program, regardless of the license terms of these # independent modules, and to copy and distribute the resulting # program under terms of your choice, provided that you also meet, # for each linked independent module, the terms and conditions of # the license of that module. An independent module is a module which # is not derived from or based on this module. If you modify this module, # you may extend this exception to your version of the module, but you # are not obliged to do so. If you do not wish to do so, delete this # exception statement from your version. """ Porter Stemmer This is the Porter stemming algorithm, ported to Python from the version coded up in ANSI C by the author. It follows the algorithm presented in Porter, M. "An algorithm for suffix stripping." Program 14.3 (1980): 130-137. only differing from it at the points marked --DEPARTURE-- and --NEW-- below. 
For a more faithful version of the Porter algorithm, see http://www.tartarus.org/~martin/PorterStemmer/ Later additions: June 2000 The 'l' of the 'logi' -> 'log' rule is put with the stem, so that short stems like 'geo' 'theo' etc work like 'archaeo' 'philo' etc. This follows a suggestion of Barry Wilkins, research student at Birmingham. February 2000 the cvc test for not dropping final -e now looks after vc at the beginning of a word, so are, eve, ice, ore, use keep final -e. In this test c is any consonant, including w, x and y. This extension was suggested by Chris Emerson. -fully -> -ful treated like -fulness -> -ful, and -tionally -> -tion treated like -tional -> -tion both in Step 2. These were suggested by Hiranmay Ghosh, of New Delhi. Invariants proceed, succeed, exceed. Also suggested by Hiranmay Ghosh. Additional modifications were made to incorperate this module into nltk. All such modifications are marked with \"--NLTK--\". """ from __future__ import print_function, unicode_literals ## --NLTK-- ## Declare this module's documentation format. __docformat__ = 'plaintext' import re from nltk.stem.api import StemmerI from nltk.compat import python_2_unicode_compatible @python_2_unicode_compatible class PorterStemmer(StemmerI): ## --NLTK-- ## Add a module docstring """ A word stemmer based on the Porter stemming algorithm. Porter, M. \"An algorithm for suffix stripping.\" Program 14.3 (1980): 130-137. A few minor modifications have been made to Porter's basic algorithm. See the source code of this module for more information. The Porter Stemmer requires that all tokens have string types. """ # The main part of the stemming algorithm starts here. # Note that only lower case sequences are stemmed. Forcing to lower case # should be done before stem(...) is called. def __init__(self): ## --NEW-- ## This is a table of irregular forms. It is quite short, but still ## reflects the errors actually drawn to Martin Porter's attention over ## a 20 year period! ## ## Extend it as necessary. ## ## The form of the table is: ## { ## "p1" : ["s11","s12","s13", ... ], ## "p2" : ["s21","s22","s23", ... ], ## ... ## "pn" : ["sn1","sn2","sn3", ... ] ## } ## ## String sij is mapped to paradigm form pi, and the main stemming ## process is then bypassed. irregular_forms = { "sky" : ["sky", "skies"], "die" : ["dying"], "lie" : ["lying"], "tie" : ["tying"], "news" : ["news"], "inning" : ["innings", "inning"], "outing" : ["outings", "outing"], "canning" : ["cannings", "canning"], "howe" : ["howe"], # --NEW-- "proceed" : ["proceed"], "exceed" : ["exceed"], "succeed" : ["succeed"], # Hiranmay Ghosh } self.pool = {} for key in irregular_forms: for val in irregular_forms[key]: self.pool[val] = key self.vowels = frozenset(['a', 'e', 'i', 'o', 'u']) def _cons(self, word, i): """cons(i) is TRUE <=> b[i] is a consonant.""" if word[i] in self.vowels: return False if word[i] == 'y': if i == 0: return True else: return (not self._cons(word, i - 1)) return True def _m(self, word, j): """m() measures the number of consonant sequences between k0 and j. if c is a consonant sequence and v a vowel sequence, and <..> indicates arbitrary presence, gives 0 vc gives 1 vcvc gives 2 vcvcvc gives 3 .... 
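           For example, following the worked examples in Porter's paper:
           'tr', 'ee', 'tree' and 'by' give m=0; 'trouble', 'oats', 'trees'
           and 'ivy' give m=1; 'troubles', 'private' and 'oaten' give m=2.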
""" n = 0 i = 0 while True: if i > j: return n if not self._cons(word, i): break i = i + 1 i = i + 1 while True: while True: if i > j: return n if self._cons(word, i): break i = i + 1 i = i + 1 n = n + 1 while True: if i > j: return n if not self._cons(word, i): break i = i + 1 i = i + 1 def _vowelinstem(self, stem): """vowelinstem(stem) is TRUE <=> stem contains a vowel""" for i in range(len(stem)): if not self._cons(stem, i): return True return False def _doublec(self, word): """doublec(word) is TRUE <=> word ends with a double consonant""" if len(word) < 2: return False if (word[-1] != word[-2]): return False return self._cons(word, len(word)-1) def _cvc(self, word, i): """cvc(i) is TRUE <=> a) ( --NEW--) i == 1, and word[0] word[1] is vowel consonant, or b) word[i - 2], word[i - 1], word[i] has the form consonant - vowel - consonant and also if the second c is not w, x or y. this is used when trying to restore an e at the end of a short word. e.g. cav(e), lov(e), hop(e), crim(e), but snow, box, tray. """ if i == 0: return False # i == 0 never happens perhaps if i == 1: return (not self._cons(word, 0) and self._cons(word, 1)) if not self._cons(word, i) or self._cons(word, i-1) or not self._cons(word, i-2): return False ch = word[i] if ch == 'w' or ch == 'x' or ch == 'y': return False return True def _step1ab(self, word): """step1ab() gets rid of plurals and -ed or -ing. e.g. caresses -> caress ponies -> poni sties -> sti tie -> tie (--NEW--: see below) caress -> caress cats -> cat feed -> feed agreed -> agree disabled -> disable matting -> mat mating -> mate meeting -> meet milling -> mill messing -> mess meetings -> meet """ if word[-1] == 's': if word.endswith("sses"): word = word[:-2] elif word.endswith("ies"): if len(word) == 4: word = word[:-1] # this line extends the original algorithm, so that # 'flies'->'fli' but 'dies'->'die' etc else: word = word[:-2] elif word[-2] != 's': word = word[:-1] ed_or_ing_trimmed = False if word.endswith("ied"): if len(word) == 4: word = word[:-1] else: word = word[:-2] # this line extends the original algorithm, so that # 'spied'->'spi' but 'died'->'die' etc elif word.endswith("eed"): if self._m(word, len(word)-4) > 0: word = word[:-1] elif word.endswith("ed") and self._vowelinstem(word[:-2]): word = word[:-2] ed_or_ing_trimmed = True elif word.endswith("ing") and self._vowelinstem(word[:-3]): word = word[:-3] ed_or_ing_trimmed = True if ed_or_ing_trimmed: if word.endswith("at") or word.endswith("bl") or word.endswith("iz"): word += 'e' elif self._doublec(word): if word[-1] not in ['l', 's', 'z']: word = word[:-1] elif (self._m(word, len(word)-1) == 1 and self._cvc(word, len(word)-1)): word += 'e' return word def _step1c(self, word): """step1c() turns terminal y to i when there is another vowel in the stem. --NEW--: This has been modified from the original Porter algorithm so that y->i is only done when y is preceded by a consonant, but not if the stem is only a single consonant, i.e. (*c and not c) Y -> I So 'happy' -> 'happi', but 'enjoy' -> 'enjoy' etc This is a much better rule. Formerly 'enjoy'->'enjoi' and 'enjoyment'-> 'enjoy'. Step 1c is perhaps done too soon; but with this modification that no longer really matters. Also, the removal of the vowelinstem(z) condition means that 'spy', 'fly', 'try' ... stem to 'spi', 'fli', 'tri' and conflate with 'spied', 'tried', 'flies' ... 
""" if word[-1] == 'y' and len(word) > 2 and self._cons(word, len(word) - 2): return word[:-1] + 'i' else: return word def _step2(self, word): """step2() maps double suffices to single ones. so -ization ( = -ize plus -ation) maps to -ize etc. note that the string before the suffix must give m() > 0. """ if len(word) <= 1: # Only possible at this stage given unusual inputs to stem_word like 'oed' return word ch = word[-2] if ch == 'a': if word.endswith("ational"): return word[:-7] + "ate" if self._m(word, len(word)-8) > 0 else word elif word.endswith("tional"): return word[:-2] if self._m(word, len(word)-7) > 0 else word else: return word elif ch == 'c': if word.endswith("enci"): return word[:-4] + "ence" if self._m(word, len(word)-5) > 0 else word elif word.endswith("anci"): return word[:-4] + "ance" if self._m(word, len(word)-5) > 0 else word else: return word elif ch == 'e': if word.endswith("izer"): return word[:-1] if self._m(word, len(word)-5) > 0 else word else: return word elif ch == 'l': if word.endswith("bli"): return word[:-3] + "ble" if self._m(word, len(word)-4) > 0 else word # --DEPARTURE-- # To match the published algorithm, replace "bli" with "abli" and "ble" with "able" elif word.endswith("alli"): # --NEW-- if self._m(word, len(word)-5) > 0: word = word[:-2] return self._step2(word) else: return word elif word.endswith("fulli"): return word[:-2] if self._m(word, len(word)-6) else word # --NEW-- elif word.endswith("entli"): return word[:-2] if self._m(word, len(word)-6) else word elif word.endswith("eli"): return word[:-2] if self._m(word, len(word)-4) else word elif word.endswith("ousli"): return word[:-2] if self._m(word, len(word)-6) else word else: return word elif ch == 'o': if word.endswith("ization"): return word[:-7] + "ize" if self._m(word, len(word)-8) else word elif word.endswith("ation"): return word[:-5] + "ate" if self._m(word, len(word)-6) else word elif word.endswith("ator"): return word[:-4] + "ate" if self._m(word, len(word)-5) else word else: return word elif ch == 's': if word.endswith("alism"): return word[:-3] if self._m(word, len(word)-6) else word elif word.endswith("ness"): if word.endswith("iveness"): return word[:-4] if self._m(word, len(word)-8) else word elif word.endswith("fulness"): return word[:-4] if self._m(word, len(word)-8) else word elif word.endswith("ousness"): return word[:-4] if self._m(word, len(word)-8) else word else: return word else: return word elif ch == 't': if word.endswith("aliti"): return word[:-3] if self._m(word, len(word)-6) else word elif word.endswith("iviti"): return word[:-5] + "ive" if self._m(word, len(word)-6) else word elif word.endswith("biliti"): return word[:-6] + "ble" if self._m(word, len(word)-7) else word else: return word elif ch == 'g': # --DEPARTURE-- if word.endswith("logi"): return word[:-1] if self._m(word, len(word) - 4) else word # --NEW-- (Barry Wilkins) # To match the published algorithm, pass len(word)-5 to _m instead of len(word)-4 else: return word else: return word def _step3(self, word): """step3() deals with -ic-, -full, -ness etc. 
similar strategy to step2.""" ch = word[-1] if ch == 'e': if word.endswith("icate"): return word[:-3] if self._m(word, len(word)-6) else word elif word.endswith("ative"): return word[:-5] if self._m(word, len(word)-6) else word elif word.endswith("alize"): return word[:-3] if self._m(word, len(word)-6) else word else: return word elif ch == 'i': if word.endswith("iciti"): return word[:-3] if self._m(word, len(word)-6) else word else: return word elif ch == 'l': if word.endswith("ical"): return word[:-2] if self._m(word, len(word)-5) else word elif word.endswith("ful"): return word[:-3] if self._m(word, len(word)-4) else word else: return word elif ch == 's': if word.endswith("ness"): return word[:-4] if self._m(word, len(word)-5) else word else: return word else: return word def _step4(self, word): """step4() takes off -ant, -ence etc., in context vcvc.""" if len(word) <= 1: # Only possible at this stage given unusual inputs to stem_word like 'oed' return word ch = word[-2] if ch == 'a': if word.endswith("al"): return word[:-2] if self._m(word, len(word)-3) > 1 else word else: return word elif ch == 'c': if word.endswith("ance"): return word[:-4] if self._m(word, len(word)-5) > 1 else word elif word.endswith("ence"): return word[:-4] if self._m(word, len(word)-5) > 1 else word else: return word elif ch == 'e': if word.endswith("er"): return word[:-2] if self._m(word, len(word)-3) > 1 else word else: return word elif ch == 'i': if word.endswith("ic"): return word[:-2] if self._m(word, len(word)-3) > 1 else word else: return word elif ch == 'l': if word.endswith("able"): return word[:-4] if self._m(word, len(word)-5) > 1 else word elif word.endswith("ible"): return word[:-4] if self._m(word, len(word)-5) > 1 else word else: return word elif ch == 'n': if word.endswith("ant"): return word[:-3] if self._m(word, len(word)-4) > 1 else word elif word.endswith("ement"): return word[:-5] if self._m(word, len(word)-6) > 1 else word elif word.endswith("ment"): return word[:-4] if self._m(word, len(word)-5) > 1 else word elif word.endswith("ent"): return word[:-3] if self._m(word, len(word)-4) > 1 else word else: return word elif ch == 'o': if word.endswith("sion") or word.endswith("tion"): # slightly different logic to all the other cases return word[:-3] if self._m(word, len(word)-4) > 1 else word elif word.endswith("ou"): return word[:-2] if self._m(word, len(word)-3) > 1 else word else: return word elif ch == 's': if word.endswith("ism"): return word[:-3] if self._m(word, len(word)-4) > 1 else word else: return word elif ch == 't': if word.endswith("ate"): return word[:-3] if self._m(word, len(word)-4) > 1 else word elif word.endswith("iti"): return word[:-3] if self._m(word, len(word)-4) > 1 else word else: return word elif ch == 'u': if word.endswith("ous"): return word[:-3] if self._m(word, len(word)-4) > 1 else word else: return word elif ch == 'v': if word.endswith("ive"): return word[:-3] if self._m(word, len(word)-4) > 1 else word else: return word elif ch == 'z': if word.endswith("ize"): return word[:-3] if self._m(word, len(word)-4) > 1 else word else: return word else: return word def _step5(self, word): """step5() removes a final -e if m() > 1, and changes -ll to -l if m() > 1. 
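Illustrative examples (added here; they follow Porter's published description):
probate -> probat, rate -> rate (m() == 1 and the stem ends consonant-vowel-consonant,
so the final -e is kept); controll -> control, roll -> roll.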
""" if word[-1] == 'e': a = self._m(word, len(word)-1) if a > 1 or (a == 1 and not self._cvc(word, len(word)-2)): word = word[:-1] if word.endswith('ll') and self._m(word, len(word)-1) > 1: word = word[:-1] return word def stem_word(self, p, i=0, j=None): """ Returns the stem of p, or, if i and j are given, the stem of p[i:j+1]. """ ## --NLTK-- if j is None and i == 0: word = p else: if j is None: j = len(p) - 1 word = p[i:j+1] if word in self.pool: return self.pool[word] if len(word) <= 2: return word # --DEPARTURE-- # With this line, strings of length 1 or 2 don't go through the # stemming process, although no mention is made of this in the # published algorithm. Remove the line to match the published # algorithm. word = self._step1ab(word) word = self._step1c(word) word = self._step2(word) word = self._step3(word) word = self._step4(word) word = self._step5(word) return word def _adjust_case(self, word, stem): lower = word.lower() ret = "" for x in range(len(stem)): if lower[x] == stem[x]: ret += word[x] else: ret += stem[x] return ret ## --NLTK-- ## Don't use this procedure; we want to work with individual ## tokens, instead. (commented out the following procedure) #def stem(self, text): # parts = re.split("(\W+)", text) # numWords = (len(parts) + 1)/2 # # ret = "" # for i in xrange(numWords): # word = parts[2 * i] # separator = "" # if ((2 * i) + 1) < len(parts): # separator = parts[(2 * i) + 1] # # stem = self.stem_word(string.lower(word), 0, len(word) - 1) # ret = ret + self.adjust_case(word, stem) # ret = ret + separator # return ret ## --NLTK-- ## Define a stem() method that implements the StemmerI interface. def stem(self, word): stem = self.stem_word(word.lower(), 0, len(word) - 1) return self._adjust_case(word, stem) ## --NLTK-- ## Add a string representation function def __repr__(self): return '' ## --NLTK-- ## This test procedure isn't applicable. #if __name__ == '__main__': # p = PorterStemmer() # if len(sys.argv) > 1: # for f in sys.argv[1:]: # with open(f, 'r') as infile: # while 1: # w = infile.readline() # if w == '': # break # w = w[:-1] # print(p.stem(w)) ##--NLTK-- ## Added a demo() function def demo(): """ A demonstration of the porter stemmer on a sample from the Penn Treebank corpus. """ from nltk.corpus import treebank from nltk import stem stemmer = stem.PorterStemmer() orig = [] stemmed = [] for item in treebank.files()[:3]: for (word, tag) in treebank.tagged_words(item): orig.append(word) stemmed.append(stemmer.stem(word)) # Convert the results to a string, and word-wrap them. results = ' '.join(stemmed) results = re.sub(r"(.{,70})\s", r'\1\n', results+' ').rstrip() # Convert the original to a string, and word wrap it. original = ' '.join(orig) original = re.sub(r"(.{,70})\s", r'\1\n', original+' ').rstrip() # Print the results. 
print('-Original-'.center(70).replace(' ', '*').replace('-', ' ')) print(original) print('-Results-'.center(70).replace(' ', '*').replace('-', ' ')) print(results) print('*'*70) ##--NLTK-- nltk-3.1/nltk/stem/regexp.py0000644000076500000240000000315012607224144015641 0ustar sbstaff00000000000000# Natural Language Toolkit: Stemmers # # Copyright (C) 2001-2015 NLTK Project # Author: Trevor Cohn # Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT from __future__ import unicode_literals import re from nltk.stem.api import StemmerI from nltk.compat import python_2_unicode_compatible @python_2_unicode_compatible class RegexpStemmer(StemmerI): """ A stemmer that uses regular expressions to identify morphological affixes. Any substrings that match the regular expressions will be removed. >>> from nltk.stem import RegexpStemmer >>> st = RegexpStemmer('ing$|s$|e$|able$', min=4) >>> st.stem('cars') 'car' >>> st.stem('mass') 'mas' >>> st.stem('was') 'was' >>> st.stem('bee') 'bee' >>> st.stem('compute') 'comput' >>> st.stem('advisable') 'advis' :type regexp: str or regexp :param regexp: The regular expression that should be used to identify morphological affixes. :type min: int :param min: The minimum length of string to stem """ def __init__(self, regexp, min=0): if not hasattr(regexp, 'pattern'): regexp = re.compile(regexp) self._regexp = regexp self._min = min def stem(self, word): if len(word) < self._min: return word else: return self._regexp.sub('', word) def __repr__(self): return '<RegexpStemmer: %r>' % self._regexp.pattern nltk-3.1/nltk/stem/rslp.py0000644000076500000240000001252412607224144015334 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: RSLP Stemmer # # Copyright (C) 2001-2015 NLTK Project # Author: Tiago Tresoldi # URL: # For license information, see LICENSE.TXT # This code is based on the algorithm presented in the paper "A Stemming # Algorithm for the Portuguese Language" by Viviane Moreira Orengo and # Christian Huyck, which unfortunately I had no access to. The code is a # Python version, with some minor modifications of mine, to the description # presented at http://www.webcitation.org/5NnvdIzOb and to the C source code # available at http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. # Please note that this stemmer is intended for demonstration and educational # purposes only. Feel free to write me for any comments, including the # development of a different and/or better stemmer for Portuguese. I also # suggest using NLTK's mailing list for Portuguese for any discussion. # Este código é baseado no algoritmo apresentado no artigo "A Stemming # Algorithm for the Portuguese Language" de Viviane Moreira Orengo e # Christian Huyck, o qual infelizmente não tive a oportunidade de ler. O # código é uma conversão para Python, com algumas pequenas modificações # minhas, daquele apresentado em http://www.webcitation.org/5NnvdIzOb e do # código para linguagem C disponível em # http://www.inf.ufrgs.br/~arcoelho/rslp/integrando_rslp.html. Por favor, # lembre-se de que este stemmer foi desenvolvido com finalidades unicamente # de demonstração e didáticas. Sinta-se livre para me escrever para qualquer # comentário, inclusive sobre o desenvolvimento de um stemmer diferente # e/ou melhor para o português. Também sugiro utilizar-se a lista de discussão # do NLTK para o português para qualquer debate.
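# Illustrative sketch (added commentary, not part of the original module):
# read_rule() below parses each line of the rule files shipped in nltk_data
# under 'stemmers/rslp' into a list of the form
#
#     [suffix, minimum_stem_size, replacement, exceptions]
#
# so a plural-reduction rule could look roughly like
#
#     ["ns", 1, "m", ["luns", "atens"]]
#
# (hypothetical values, shown only to indicate the shape of the parsed rules).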
from __future__ import print_function, unicode_literals from nltk.data import load from nltk.stem.api import StemmerI class RSLPStemmer(StemmerI): """ A stemmer for Portuguese. >>> from nltk.stem import RSLPStemmer >>> st = RSLPStemmer() >>> # opening lines of Erico Verissimo's "Música ao Longe" >>> text = ''' ... Clarissa risca com giz no quadro-negro a paisagem que os alunos ... devem copiar . Uma casinha de porta e janela , em cima duma ... coxilha .''' >>> for token in text.split(): ... print(st.stem(token)) clariss risc com giz no quadro-negr a pais que os alun dev copi . uma cas de port e janel , em cim dum coxilh . """ def __init__ (self): self._model = [] self._model.append( self.read_rule("step0.pt") ) self._model.append( self.read_rule("step1.pt") ) self._model.append( self.read_rule("step2.pt") ) self._model.append( self.read_rule("step3.pt") ) self._model.append( self.read_rule("step4.pt") ) self._model.append( self.read_rule("step5.pt") ) self._model.append( self.read_rule("step6.pt") ) def read_rule (self, filename): rules = load('nltk:stemmers/rslp/' + filename, format='raw').decode("utf8") lines = rules.split("\n") lines = [line for line in lines if line != ""] # remove blank lines lines = [line for line in lines if line[0] != "#"] # remove comments # NOTE: a simple but ugly hack to make this parser happy with double '\t's lines = [line.replace("\t\t", "\t") for line in lines] # parse rules rules = [] for line in lines: rule = [] tokens = line.split("\t") # text to be searched for at the end of the string rule.append( tokens[0][1:-1] ) # remove quotes # minimum stem size to perform the replacement rule.append( int(tokens[1]) ) # text to be replaced into rule.append( tokens[2][1:-1] ) # remove quotes # exceptions to this rule rule.append( [token[1:-1] for token in tokens[3].split(",")] ) # append to the results rules.append(rule) return rules def stem(self, word): word = word.lower() # the word ends in 's'? apply rule for plural reduction if word[-1] == "s": word = self.apply_rule(word, 0) # the word ends in 'a'? apply rule for feminine reduction if word[-1] == "a": word = self.apply_rule(word, 1) # augmentative reduction word = self.apply_rule(word, 3) # adverb reduction word = self.apply_rule(word, 2) # noun reduction prev_word = word word = self.apply_rule(word, 4) if word == prev_word: # verb reduction prev_word = word word = self.apply_rule(word, 5) if word == prev_word: # vowel removal word = self.apply_rule(word, 6) return word def apply_rule(self, word, rule_index): rules = self._model[rule_index] for rule in rules: suffix_length = len(rule[0]) if word[-suffix_length:] == rule[0]: # if suffix matches if len(word) >= suffix_length + rule[1]: # if we have minimum size if word not in rule[3]: # if not an exception word = word[:-suffix_length] + rule[2] break return word nltk-3.1/nltk/stem/snowball.py0000644000076500000240000043467612607224144016215 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # # Natural Language Toolkit: Snowball Stemmer # # Copyright (C) 2001-2015 NLTK Project # Author: Peter Michael Stahl # Peter Ljunglof (revisions) # Algorithms: Dr Martin Porter # URL: # For license information, see LICENSE.TXT """ Snowball stemmers This module provides a port of the Snowball stemmers developed by Martin Porter. There is also a demo function: `snowball.demo()`. 
""" from __future__ import unicode_literals, print_function from nltk import compat from nltk.corpus import stopwords from nltk.stem import porter from nltk.stem.util import suffix_replace from nltk.stem.api import StemmerI class SnowballStemmer(StemmerI): """ Snowball Stemmer The following languages are supported: Danish, Dutch, English, Finnish, French, German, Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian, Spanish and Swedish. The algorithm for English is documented here: Porter, M. \"An algorithm for suffix stripping.\" Program 14.3 (1980): 130-137. The algorithms have been developed by Martin Porter. These stemmers are called Snowball, because Porter created a programming language with this name for creating new stemming algorithms. There is more information available at http://snowball.tartarus.org/ The stemmer is invoked as shown below: >>> from nltk.stem import SnowballStemmer >>> print(" ".join(SnowballStemmer.languages)) # See which languages are supported danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish >>> stemmer = SnowballStemmer("german") # Choose a language >>> stemmer.stem("Autobahnen") # Stem a word 'autobahn' Invoking the stemmers that way is useful if you do not know the language to be stemmed at runtime. Alternatively, if you already know the language, then you can invoke the language specific stemmer directly: >>> from nltk.stem.snowball import GermanStemmer >>> stemmer = GermanStemmer() >>> stemmer.stem("Autobahnen") 'autobahn' :param language: The language whose subclass is instantiated. :type language: str or unicode :param ignore_stopwords: If set to True, stopwords are not stemmed and returned unchanged. Set to False by default. :type ignore_stopwords: bool :raise ValueError: If there is no stemmer for the specified language, a ValueError is raised. """ languages = ("danish", "dutch", "english", "finnish", "french", "german", "hungarian", "italian", "norwegian", "porter", "portuguese", "romanian", "russian", "spanish", "swedish") def __init__(self, language, ignore_stopwords=False): if language not in self.languages: raise ValueError("The language '%s' is not supported." % language) stemmerclass = globals()[language.capitalize() + "Stemmer"] self.stemmer = stemmerclass(ignore_stopwords) self.stem = self.stemmer.stem self.stopwords = self.stemmer.stopwords @compat.python_2_unicode_compatible class _LanguageSpecificStemmer(StemmerI): """ This helper subclass offers the possibility to invoke a specific stemmer directly. This is useful if you already know the language to be stemmed at runtime. Create an instance of the Snowball stemmer. :param ignore_stopwords: If set to True, stopwords are not stemmed and returned unchanged. Set to False by default. :type ignore_stopwords: bool """ def __init__(self, ignore_stopwords=False): # The language is the name of the class, minus the final "Stemmer". language = type(self).__name__.lower() if language.endswith("stemmer"): language = language[:-7] self.stopwords = set() if ignore_stopwords: try: for word in stopwords.words(language): self.stopwords.add(word) except IOError: raise ValueError("%r has no list of stopwords. Please set" " 'ignore_stopwords' to 'False'." % self) def __repr__(self): """ Print out the string representation of the respective class. """ return "<%s>" % type(self).__name__ class PorterStemmer(_LanguageSpecificStemmer, porter.PorterStemmer): """ A word stemmer based on the original Porter stemming algorithm. Porter, M. 
\"An algorithm for suffix stripping.\" Program 14.3 (1980): 130-137. A few minor modifications have been made to Porter's basic algorithm. See the source code of the module nltk.stem.porter for more information. """ def __init__(self, ignore_stopwords=False): _LanguageSpecificStemmer.__init__(self, ignore_stopwords) porter.PorterStemmer.__init__(self) class _ScandinavianStemmer(_LanguageSpecificStemmer): """ This subclass encapsulates a method for defining the string region R1. It is used by the Danish, Norwegian, and Swedish stemmer. """ def _r1_scandinavian(self, word, vowels): """ Return the region R1 that is used by the Scandinavian stemmers. R1 is the region after the first non-vowel following a vowel, or is the null region at the end of the word if there is no such non-vowel. But then R1 is adjusted so that the region before it contains at least three letters. :param word: The word whose region R1 is determined. :type word: str or unicode :param vowels: The vowels of the respective language that are used to determine the region R1. :type vowels: unicode :return: the region R1 for the respective word. :rtype: unicode :note: This helper method is invoked by the respective stem method of the subclasses DanishStemmer, NorwegianStemmer, and SwedishStemmer. It is not to be invoked directly! """ r1 = "" for i in range(1, len(word)): if word[i] not in vowels and word[i-1] in vowels: if len(word[:i+1]) < 3 and len(word[:i+1]) > 0: r1 = word[3:] elif len(word[:i+1]) >= 3: r1 = word[i+1:] else: return word break return r1 class _StandardStemmer(_LanguageSpecificStemmer): """ This subclass encapsulates two methods for defining the standard versions of the string regions R1, R2, and RV. """ def _r1r2_standard(self, word, vowels): """ Return the standard interpretations of the string regions R1 and R2. R1 is the region after the first non-vowel following a vowel, or is the null region at the end of the word if there is no such non-vowel. R2 is the region after the first non-vowel following a vowel in R1, or is the null region at the end of the word if there is no such non-vowel. :param word: The word whose regions R1 and R2 are determined. :type word: str or unicode :param vowels: The vowels of the respective language that are used to determine the regions R1 and R2. :type vowels: unicode :return: (r1,r2), the regions R1 and R2 for the respective word. :rtype: tuple :note: This helper method is invoked by the respective stem method of the subclasses DutchStemmer, FinnishStemmer, FrenchStemmer, GermanStemmer, ItalianStemmer, PortugueseStemmer, RomanianStemmer, and SpanishStemmer. It is not to be invoked directly! :note: A detailed description of how to define R1 and R2 can be found at http://snowball.tartarus.org/texts/r1r2.html """ r1 = "" r2 = "" for i in range(1, len(word)): if word[i] not in vowels and word[i-1] in vowels: r1 = word[i+1:] break for i in range(1, len(r1)): if r1[i] not in vowels and r1[i-1] in vowels: r2 = r1[i+1:] break return (r1, r2) def _rv_standard(self, word, vowels): """ Return the standard interpretation of the string region RV. If the second letter is a consonant, RV is the region after the next following vowel. If the first two letters are vowels, RV is the region after the next following consonant. Otherwise, RV is the region after the third letter. :param word: The word whose region RV is determined. :type word: str or unicode :param vowels: The vowels of the respective language that are used to determine the region RV. 
:type vowels: unicode :return: the region RV for the respective word. :rtype: unicode :note: This helper method is invoked by the respective stem method of the subclasses ItalianStemmer, PortugueseStemmer, RomanianStemmer, and SpanishStemmer. It is not to be invoked directly! """ rv = "" if len(word) >= 2: if word[1] not in vowels: for i in range(2, len(word)): if word[i] in vowels: rv = word[i+1:] break elif word[0] in vowels and word[1] in vowels: for i in range(2, len(word)): if word[i] not in vowels: rv = word[i+1:] break else: rv = word[3:] return rv class DanishStemmer(_ScandinavianStemmer): """ The Danish Snowball stemmer. :cvar __vowels: The Danish vowels. :type __vowels: unicode :cvar __consonants: The Danish consonants. :type __consonants: unicode :cvar __double_consonants: The Danish double consonants. :type __double_consonants: tuple :cvar __s_ending: Letters that may directly appear before a word final 's'. :type __s_ending: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :note: A detailed description of the Danish stemming algorithm can be found under http://snowball.tartarus.org/algorithms/danish/stemmer.html """ # The language's vowels and other important characters are defined. __vowels = "aeiouy\xE6\xE5\xF8" __consonants = "bcdfghjklmnpqrstvwxz" __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj", "kk", "ll", "mm", "nn", "pp", "qq", "rr", "ss", "tt", "vv", "ww", "xx", "zz") __s_ending = "abcdfghjklmnoprtvyz\xE5" # The different suffixes, divided into the algorithm's steps # and organized by length, are listed in tuples. __step1_suffixes = ("erendes", "erende", "hedens", "ethed", "erede", "heden", "heder", "endes", "ernes", "erens", "erets", "ered", "ende", "erne", "eren", "erer", "heds", "enes", "eres", "eret", "hed", "ene", "ere", "ens", "ers", "ets", "en", "er", "es", "et", "e", "s") __step2_suffixes = ("gd", "dt", "gt", "kt") __step3_suffixes = ("elig", "l\xF8st", "lig", "els", "ig") def stem(self, word): """ Stem a Danish word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ # Every word is put into lower case for normalization. word = word.lower() if word in self.stopwords: return word # After this, the required regions are generated # by the respective helper method. r1 = self._r1_scandinavian(word, self.__vowels) # Then the actual stemming process starts. # Every new step is explicitly indicated # according to the descriptions on the Snowball website. 
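# Illustrative note (added commentary, not in the original source): for a
# word such as "undervisning" the first non-vowel after a vowel is reached
# before three letters have been seen, so _r1_scandinavian() adjusts R1 to
# start at position 3, giving r1 == "ervisning" before the steps below run.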
# STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if suffix == "s": if word[-2] in self.__s_ending: word = word[:-1] r1 = r1[:-1] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] break # STEP 2 for suffix in self.__step2_suffixes: if r1.endswith(suffix): word = word[:-1] r1 = r1[:-1] break # STEP 3 if r1.endswith("igst"): word = word[:-2] r1 = r1[:-2] for suffix in self.__step3_suffixes: if r1.endswith(suffix): if suffix == "l\xF8st": word = word[:-1] r1 = r1[:-1] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] if r1.endswith(self.__step2_suffixes): word = word[:-1] r1 = r1[:-1] break # STEP 4: Undouble for double_cons in self.__double_consonants: if word.endswith(double_cons) and len(word) > 3: word = word[:-1] break return word class DutchStemmer(_StandardStemmer): """ The Dutch Snowball stemmer. :cvar __vowels: The Dutch vowels. :type __vowels: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step3b_suffixes: Suffixes to be deleted in step 3b of the algorithm. :type __step3b_suffixes: tuple :note: A detailed description of the Dutch stemming algorithm can be found under http://snowball.tartarus.org/algorithms/dutch/stemmer.html """ __vowels = "aeiouy\xE8" __step1_suffixes = ("heden", "ene", "en", "se", "s") __step3b_suffixes = ("baar", "lijk", "bar", "end", "ing", "ig") def stem(self, word): """ Stem a Dutch word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() if word in self.stopwords: return word step2_success = False # Vowel accents are removed. word = (word.replace("\xE4", "a").replace("\xE1", "a") .replace("\xEB", "e").replace("\xE9", "e") .replace("\xED", "i").replace("\xEF", "i") .replace("\xF6", "o").replace("\xF3", "o") .replace("\xFC", "u").replace("\xFA", "u")) # An initial 'y', a 'y' after a vowel, # and an 'i' between self.__vowels is put into upper case. # As from now these are treated as consonants. if word.startswith("y"): word = "".join(("Y", word[1:])) for i in range(1, len(word)): if word[i-1] in self.__vowels and word[i] == "y": word = "".join((word[:i], "Y", word[i+1:])) for i in range(1, len(word)-1): if (word[i-1] in self.__vowels and word[i] == "i" and word[i+1] in self.__vowels): word = "".join((word[:i], "I", word[i+1:])) r1, r2 = self._r1r2_standard(word, self.__vowels) # R1 is adjusted so that the region before it # contains at least 3 letters. 
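# Illustrative note (added commentary, not in the original source): for a
# short word like "eten", _r1r2_standard() above yields r1 == "en", but the
# loop below moves the start of R1 to position 3, leaving r1 == "n".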
for i in range(1, len(word)): if word[i] not in self.__vowels and word[i-1] in self.__vowels: if len(word[:i+1]) < 3 and len(word[:i+1]) > 0: r1 = word[3:] elif len(word[:i+1]) == 0: return word break # STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if suffix == "heden": word = suffix_replace(word, suffix, "heid") r1 = suffix_replace(r1, suffix, "heid") if r2.endswith("heden"): r2 = suffix_replace(r2, suffix, "heid") elif (suffix in ("ene", "en") and not word.endswith("heden") and word[-len(suffix)-1] not in self.__vowels and word[-len(suffix)-3:-len(suffix)] != "gem"): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] if word.endswith(("kk", "dd", "tt")): word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] elif (suffix in ("se", "s") and word[-len(suffix)-1] not in self.__vowels and word[-len(suffix)-1] != "j"): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 2 if r1.endswith("e") and word[-2] not in self.__vowels: step2_success = True word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] if word.endswith(("kk", "dd", "tt")): word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] # STEP 3a if r2.endswith("heid") and word[-5] != "c": word = word[:-4] r1 = r1[:-4] r2 = r2[:-4] if (r1.endswith("en") and word[-3] not in self.__vowels and word[-5:-2] != "gem"): word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] if word.endswith(("kk", "dd", "tt")): word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] # STEP 3b: Derivational suffixes for suffix in self.__step3b_suffixes: if r2.endswith(suffix): if suffix in ("end", "ing"): word = word[:-3] r2 = r2[:-3] if r2.endswith("ig") and word[-3] != "e": word = word[:-2] else: if word.endswith(("kk", "dd", "tt")): word = word[:-1] elif suffix == "ig" and word[-3] != "e": word = word[:-2] elif suffix == "lijk": word = word[:-4] r1 = r1[:-4] if r1.endswith("e") and word[-2] not in self.__vowels: word = word[:-1] if word.endswith(("kk", "dd", "tt")): word = word[:-1] elif suffix == "baar": word = word[:-4] elif suffix == "bar" and step2_success: word = word[:-3] break # STEP 4: Undouble vowel if len(word) >= 4: if word[-1] not in self.__vowels and word[-1] != "I": if word[-3:-1] in ("aa", "ee", "oo", "uu"): if word[-4] not in self.__vowels: word = "".join((word[:-3], word[-3], word[-1])) # All occurrences of 'I' and 'Y' are put back into lower case. word = word.replace("I", "i").replace("Y", "y") return word class EnglishStemmer(_StandardStemmer): """ The English Snowball stemmer. :cvar __vowels: The English vowels. :type __vowels: unicode :cvar __double_consonants: The English double consonants. :type __double_consonants: tuple :cvar __li_ending: Letters that may directly appear before a word final 'li'. :type __li_ending: unicode :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. :type __step0_suffixes: tuple :cvar __step1a_suffixes: Suffixes to be deleted in step 1a of the algorithm. :type __step1a_suffixes: tuple :cvar __step1b_suffixes: Suffixes to be deleted in step 1b of the algorithm. :type __step1b_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. :type __step4_suffixes: tuple :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm. 
:type __step5_suffixes: tuple :cvar __special_words: A dictionary containing words which have to be stemmed specially. :type __special_words: dict :note: A detailed description of the English stemming algorithm can be found under http://snowball.tartarus.org/algorithms/english/stemmer.html """ __vowels = "aeiouy" __double_consonants = ("bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt") __li_ending = "cdeghkmnrt" __step0_suffixes = ("'s'", "'s", "'") __step1a_suffixes = ("sses", "ied", "ies", "us", "ss", "s") __step1b_suffixes = ("eedly", "ingly", "edly", "eed", "ing", "ed") __step2_suffixes = ('ization', 'ational', 'fulness', 'ousness', 'iveness', 'tional', 'biliti', 'lessli', 'entli', 'ation', 'alism', 'aliti', 'ousli', 'iviti', 'fulli', 'enci', 'anci', 'abli', 'izer', 'ator', 'alli', 'bli', 'ogi', 'li') __step3_suffixes = ('ational', 'tional', 'alize', 'icate', 'iciti', 'ative', 'ical', 'ness', 'ful') __step4_suffixes = ('ement', 'ance', 'ence', 'able', 'ible', 'ment', 'ant', 'ent', 'ism', 'ate', 'iti', 'ous', 'ive', 'ize', 'ion', 'al', 'er', 'ic') __step5_suffixes = ("e", "l") __special_words = {"skis" : "ski", "skies" : "sky", "dying" : "die", "lying" : "lie", "tying" : "tie", "idly" : "idl", "gently" : "gentl", "ugly" : "ugli", "early" : "earli", "only" : "onli", "singly" : "singl", "sky" : "sky", "news" : "news", "howe" : "howe", "atlas" : "atlas", "cosmos" : "cosmos", "bias" : "bias", "andes" : "andes", "inning" : "inning", "innings" : "inning", "outing" : "outing", "outings" : "outing", "canning" : "canning", "cannings" : "canning", "herring" : "herring", "herrings" : "herring", "earring" : "earring", "earrings" : "earring", "proceed" : "proceed", "proceeds" : "proceed", "proceeded" : "proceed", "proceeding" : "proceed", "exceed" : "exceed", "exceeds" : "exceed", "exceeded" : "exceed", "exceeding" : "exceed", "succeed" : "succeed", "succeeds" : "succeed", "succeeded" : "succeed", "succeeding" : "succeed"} def stem(self, word): """ Stem an English word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
:rtype: unicode """ word = word.lower() if word in self.stopwords or len(word) <= 2: return word elif word in self.__special_words: return self.__special_words[word] # Map the different apostrophe characters to a single consistent one word = (word.replace("\u2019", "\x27") .replace("\u2018", "\x27") .replace("\u201B", "\x27")) if word.startswith("\x27"): word = word[1:] if word.startswith("y"): word = "".join(("Y", word[1:])) for i in range(1, len(word)): if word[i-1] in self.__vowels and word[i] == "y": word = "".join((word[:i], "Y", word[i+1:])) step1a_vowel_found = False step1b_vowel_found = False r1 = "" r2 = "" if word.startswith(("gener", "commun", "arsen")): if word.startswith(("gener", "arsen")): r1 = word[5:] else: r1 = word[6:] for i in range(1, len(r1)): if r1[i] not in self.__vowels and r1[i-1] in self.__vowels: r2 = r1[i+1:] break else: r1, r2 = self._r1r2_standard(word, self.__vowels) # STEP 0 for suffix in self.__step0_suffixes: if word.endswith(suffix): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 1a for suffix in self.__step1a_suffixes: if word.endswith(suffix): if suffix == "sses": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix in ("ied", "ies"): if len(word[:-len(suffix)]) > 1: word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] else: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] elif suffix == "s": for letter in word[:-2]: if letter in self.__vowels: step1a_vowel_found = True break if step1a_vowel_found: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] break # STEP 1b for suffix in self.__step1b_suffixes: if word.endswith(suffix): if suffix in ("eed", "eedly"): if r1.endswith(suffix): word = suffix_replace(word, suffix, "ee") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ee") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "ee") else: r2 = "" else: for letter in word[:-len(suffix)]: if letter in self.__vowels: step1b_vowel_found = True break if step1b_vowel_found: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] if word.endswith(("at", "bl", "iz")): word = "".join((word, "e")) r1 = "".join((r1, "e")) if len(word) > 5 or len(r1) >=3: r2 = "".join((r2, "e")) elif word.endswith(self.__double_consonants): word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] elif ((r1 == "" and len(word) >= 3 and word[-1] not in self.__vowels and word[-1] not in "wxY" and word[-2] in self.__vowels and word[-3] not in self.__vowels) or (r1 == "" and len(word) == 2 and word[0] in self.__vowels and word[1] not in self.__vowels)): word = "".join((word, "e")) if len(r1) > 0: r1 = "".join((r1, "e")) if len(r2) > 0: r2 = "".join((r2, "e")) break # STEP 1c if len(word) > 2 and word[-1] in "yY" and word[-2] not in self.__vowels: word = "".join((word[:-1], "i")) if len(r1) >= 1: r1 = "".join((r1[:-1], "i")) else: r1 = "" if len(r2) >= 1: r2 = "".join((r2[:-1], "i")) else: r2 = "" # STEP 2 for suffix in self.__step2_suffixes: if word.endswith(suffix): if r1.endswith(suffix): if suffix == "tional": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix in ("enci", "anci", "abli"): word = "".join((word[:-1], "e")) if len(r1) >= 1: r1 = "".join((r1[:-1], "e")) else: r1 = "" if len(r2) >= 1: r2 = "".join((r2[:-1], "e")) else: r2 = "" elif suffix == "entli": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix in ("izer", "ization"): word = suffix_replace(word, suffix, "ize") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ize") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "ize") 
else: r2 = "" elif suffix in ("ational", "ation", "ator"): word = suffix_replace(word, suffix, "ate") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ate") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "ate") else: r2 = "e" elif suffix in ("alism", "aliti", "alli"): word = suffix_replace(word, suffix, "al") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "al") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "al") else: r2 = "" elif suffix == "fulness": word = word[:-4] r1 = r1[:-4] r2 = r2[:-4] elif suffix in ("ousli", "ousness"): word = suffix_replace(word, suffix, "ous") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ous") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "ous") else: r2 = "" elif suffix in ("iveness", "iviti"): word = suffix_replace(word, suffix, "ive") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ive") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "ive") else: r2 = "e" elif suffix in ("biliti", "bli"): word = suffix_replace(word, suffix, "ble") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ble") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "ble") else: r2 = "" elif suffix == "ogi" and word[-4] == "l": word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] elif suffix in ("fulli", "lessli"): word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix == "li" and word[-3] in self.__li_ending: word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] break # STEP 3 for suffix in self.__step3_suffixes: if word.endswith(suffix): if r1.endswith(suffix): if suffix == "tional": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix == "ational": word = suffix_replace(word, suffix, "ate") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ate") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "ate") else: r2 = "" elif suffix == "alize": word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] elif suffix in ("icate", "iciti", "ical"): word = suffix_replace(word, suffix, "ic") if len(r1) >= len(suffix): r1 = suffix_replace(r1, suffix, "ic") else: r1 = "" if len(r2) >= len(suffix): r2 = suffix_replace(r2, suffix, "ic") else: r2 = "" elif suffix in ("ful", "ness"): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] elif suffix == "ative" and r2.endswith(suffix): word = word[:-5] r1 = r1[:-5] r2 = r2[:-5] break # STEP 4 for suffix in self.__step4_suffixes: if word.endswith(suffix): if r2.endswith(suffix): if suffix == "ion": if word[-4] in "st": word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 5 if r2.endswith("l") and word[-2] == "l": word = word[:-1] elif r2.endswith("e"): word = word[:-1] elif r1.endswith("e"): if len(word) >= 4 and (word[-2] in self.__vowels or word[-2] in "wxY" or word[-3] not in self.__vowels or word[-4] in self.__vowels): word = word[:-1] word = word.replace("Y", "y") return word class FinnishStemmer(_StandardStemmer): """ The Finnish Snowball stemmer. :cvar __vowels: The Finnish vowels. :type __vowels: unicode :cvar __restricted_vowels: A subset of the Finnish vowels. :type __restricted_vowels: unicode :cvar __long_vowels: The Finnish vowels in their long forms. :type __long_vowels: tuple :cvar __consonants: The Finnish consonants. :type __consonants: unicode :cvar __double_consonants: The Finnish double consonants. 
:type __double_consonants: tuple :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. :type __step4_suffixes: tuple :note: A detailed description of the Finnish stemming algorithm can be found under http://snowball.tartarus.org/algorithms/finnish/stemmer.html """ __vowels = "aeiouy\xE4\xF6" __restricted_vowels = "aeiou\xE4\xF6" __long_vowels = ("aa", "ee", "ii", "oo", "uu", "\xE4\xE4", "\xF6\xF6") __consonants = "bcdfghjklmnpqrstvwxz" __double_consonants = ("bb", "cc", "dd", "ff", "gg", "hh", "jj", "kk", "ll", "mm", "nn", "pp", "qq", "rr", "ss", "tt", "vv", "ww", "xx", "zz") __step1_suffixes = ('kaan', 'k\xE4\xE4n', 'sti', 'kin', 'han', 'h\xE4n', 'ko', 'k\xF6', 'pa', 'p\xE4') __step2_suffixes = ('nsa', 'ns\xE4', 'mme', 'nne', 'si', 'ni', 'an', '\xE4n', 'en') __step3_suffixes = ('siin', 'tten', 'seen', 'han', 'hen', 'hin', 'hon', 'h\xE4n', 'h\xF6n', 'den', 'tta', 'tt\xE4', 'ssa', 'ss\xE4', 'sta', 'st\xE4', 'lla', 'll\xE4', 'lta', 'lt\xE4', 'lle', 'ksi', 'ine', 'ta', 't\xE4', 'na', 'n\xE4', 'a', '\xE4', 'n') __step4_suffixes = ('impi', 'impa', 'imp\xE4', 'immi', 'imma', 'imm\xE4', 'mpi', 'mpa', 'mp\xE4', 'mmi', 'mma', 'mm\xE4', 'eja', 'ej\xE4') def stem(self, word): """ Stem a Finnish word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() if word in self.stopwords: return word step3_success = False r1, r2 = self._r1r2_standard(word, self.__vowels) # STEP 1: Particles etc. 
for suffix in self.__step1_suffixes: if r1.endswith(suffix): if suffix == "sti": if suffix in r2: word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] else: if word[-len(suffix)-1] in "ntaeiouy\xE4\xF6": word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 2: Possessives for suffix in self.__step2_suffixes: if r1.endswith(suffix): if suffix == "si": if word[-3] != "k": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix == "ni": word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] if word.endswith("kse"): word = suffix_replace(word, "kse", "ksi") if r1.endswith("kse"): r1 = suffix_replace(r1, "kse", "ksi") if r2.endswith("kse"): r2 = suffix_replace(r2, "kse", "ksi") elif suffix == "an": if (word[-4:-2] in ("ta", "na") or word[-5:-2] in ("ssa", "sta", "lla", "lta")): word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix == "\xE4n": if (word[-4:-2] in ("t\xE4", "n\xE4") or word[-5:-2] in ("ss\xE4", "st\xE4", "ll\xE4", "lt\xE4")): word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] elif suffix == "en": if word[-5:-2] in ("lle", "ine"): word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] else: word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] break # STEP 3: Cases for suffix in self.__step3_suffixes: if r1.endswith(suffix): if suffix in ("han", "hen", "hin", "hon", "h\xE4n", "h\xF6n"): if ((suffix == "han" and word[-4] == "a") or (suffix == "hen" and word[-4] == "e") or (suffix == "hin" and word[-4] == "i") or (suffix == "hon" and word[-4] == "o") or (suffix == "h\xE4n" and word[-4] == "\xE4") or (suffix == "h\xF6n" and word[-4] == "\xF6")): word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] step3_success = True elif suffix in ("siin", "den", "tten"): if (word[-len(suffix)-1] == "i" and word[-len(suffix)-2] in self.__restricted_vowels): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] step3_success = True else: continue elif suffix == "seen": if word[-6:-4] in self.__long_vowels: word = word[:-4] r1 = r1[:-4] r2 = r2[:-4] step3_success = True else: continue elif suffix in ("a", "\xE4"): if word[-2] in self.__vowels and word[-3] in self.__consonants: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] step3_success = True elif suffix in ("tta", "tt\xE4"): if word[-4] == "e": word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] step3_success = True elif suffix == "n": word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] step3_success = True if word[-2:] == "ie" or word[-2:] in self.__long_vowels: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] step3_success = True break # STEP 4: Other endings for suffix in self.__step4_suffixes: if r2.endswith(suffix): if suffix in ("mpi", "mpa", "mp\xE4", "mmi", "mma", "mm\xE4"): if word[-5:-3] != "po": word = word[:-3] r1 = r1[:-3] r2 = r2[:-3] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 5: Plurals if step3_success and len(r1) >= 1 and r1[-1] in "ij": word = word[:-1] r1 = r1[:-1] elif (not step3_success and len(r1) >= 2 and r1[-1] == "t" and r1[-2] in self.__vowels): word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] if r2.endswith("imma"): word = word[:-4] r1 = r1[:-4] elif r2.endswith("mma") and r2[-5:-3] != "po": word = word[:-3] r1 = r1[:-3] # STEP 6: Tidying up if r1[-2:] in self.__long_vowels: word = word[:-1] r1 = r1[:-1] if (len(r1) >= 2 and r1[-2] in self.__consonants and r1[-1] in "a\xE4ei"): word = word[:-1] r1 = r1[:-1] if r1.endswith(("oj", "uj")): word = word[:-1] r1 = r1[:-1] if r1.endswith("jo"): word = word[:-1] r1 = r1[:-1] # If the word ends with a 
double consonant # followed by zero or more vowels, the last consonant is removed. for i in range(1, len(word)): if word[-i] in self.__vowels: continue else: if i == 1: if word[-i-1:] in self.__double_consonants: word = word[:-1] else: if word[-i-1:-i+1] in self.__double_consonants: word = "".join((word[:-i], word[-i+1:])) break return word class FrenchStemmer(_StandardStemmer): """ The French Snowball stemmer. :cvar __vowels: The French vowels. :type __vowels: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. :type __step2a_suffixes: tuple :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. :type __step2b_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. :type __step4_suffixes: tuple :note: A detailed description of the French stemming algorithm can be found under http://snowball.tartarus.org/algorithms/french/stemmer.html """ __vowels = "aeiouy\xE2\xE0\xEB\xE9\xEA\xE8\xEF\xEE\xF4\xFB\xF9" __step1_suffixes = ('issements', 'issement', 'atrices', 'atrice', 'ateurs', 'ations', 'logies', 'usions', 'utions', 'ements', 'amment', 'emment', 'ances', 'iqUes', 'ismes', 'ables', 'istes', 'ateur', 'ation', 'logie', 'usion', 'ution', 'ences', 'ement', 'euses', 'ments', 'ance', 'iqUe', 'isme', 'able', 'iste', 'ence', 'it\xE9s', 'ives', 'eaux', 'euse', 'ment', 'eux', 'it\xE9', 'ive', 'ifs', 'aux', 'if') __step2a_suffixes = ('issaIent', 'issantes', 'iraIent', 'issante', 'issants', 'issions', 'irions', 'issais', 'issait', 'issant', 'issent', 'issiez', 'issons', 'irais', 'irait', 'irent', 'iriez', 'irons', 'iront', 'isses', 'issez', '\xEEmes', '\xEEtes', 'irai', 'iras', 'irez', 'isse', 'ies', 'ira', '\xEEt', 'ie', 'ir', 'is', 'it', 'i') __step2b_suffixes = ('eraIent', 'assions', 'erions', 'assent', 'assiez', '\xE8rent', 'erais', 'erait', 'eriez', 'erons', 'eront', 'aIent', 'antes', 'asses', 'ions', 'erai', 'eras', 'erez', '\xE2mes', '\xE2tes', 'ante', 'ants', 'asse', '\xE9es', 'era', 'iez', 'ais', 'ait', 'ant', '\xE9e', '\xE9s', 'er', 'ez', '\xE2t', 'ai', 'as', '\xE9', 'a') __step4_suffixes = ('i\xE8re', 'I\xE8re', 'ion', 'ier', 'Ier', 'e', '\xEB') def stem(self, word): """ Stem a French word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() if word in self.stopwords: return word step1_success = False rv_ending_found = False step2a_success = False step2b_success = False # Every occurrence of 'u' after 'q' is put into upper case. for i in range(1, len(word)): if word[i-1] == "q" and word[i] == "u": word = "".join((word[:i], "U", word[i+1:])) # Every occurrence of 'u' and 'i' # between vowels is put into upper case. # Every occurrence of 'y' preceded or # followed by a vowel is also put into upper case. 
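# Illustrative note (added commentary, not in the original source): in a word
# like "jouer" the 'u' sits between two vowels, so the loop below rewrites the
# word as "joUer"; the marked 'U' is then treated as a consonant for the rest
# of the algorithm and is mapped back to lower case at the end.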
for i in range(1, len(word)-1): if word[i-1] in self.__vowels and word[i+1] in self.__vowels: if word[i] == "u": word = "".join((word[:i], "U", word[i+1:])) elif word[i] == "i": word = "".join((word[:i], "I", word[i+1:])) if word[i-1] in self.__vowels or word[i+1] in self.__vowels: if word[i] == "y": word = "".join((word[:i], "Y", word[i+1:])) r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self.__rv_french(word, self.__vowels) # STEP 1: Standard suffix removal for suffix in self.__step1_suffixes: if word.endswith(suffix): if suffix == "eaux": word = word[:-1] step1_success = True elif suffix in ("euse", "euses"): if suffix in r2: word = word[:-len(suffix)] step1_success = True elif suffix in r1: word = suffix_replace(word, suffix, "eux") step1_success = True elif suffix in ("ement", "ements") and suffix in rv: word = word[:-len(suffix)] step1_success = True if word[-2:] == "iv" and "iv" in r2: word = word[:-2] if word[-2:] == "at" and "at" in r2: word = word[:-2] elif word[-3:] == "eus": if "eus" in r2: word = word[:-3] elif "eus" in r1: word = "".join((word[:-1], "x")) elif word[-3:] in ("abl", "iqU"): if "abl" in r2 or "iqU" in r2: word = word[:-3] elif word[-3:] in ("i\xE8r", "I\xE8r"): if "i\xE8r" in rv or "I\xE8r" in rv: word = "".join((word[:-3], "i")) elif suffix == "amment" and suffix in rv: word = suffix_replace(word, "amment", "ant") rv = suffix_replace(rv, "amment", "ant") rv_ending_found = True elif suffix == "emment" and suffix in rv: word = suffix_replace(word, "emment", "ent") rv_ending_found = True elif (suffix in ("ment", "ments") and suffix in rv and not rv.startswith(suffix) and rv[rv.rindex(suffix)-1] in self.__vowels): word = word[:-len(suffix)] rv = rv[:-len(suffix)] rv_ending_found = True elif suffix == "aux" and suffix in r1: word = "".join((word[:-2], "l")) step1_success = True elif (suffix in ("issement", "issements") and suffix in r1 and word[-len(suffix)-1] not in self.__vowels): word = word[:-len(suffix)] step1_success = True elif suffix in ("ance", "iqUe", "isme", "able", "iste", "eux", "ances", "iqUes", "ismes", "ables", "istes") and suffix in r2: word = word[:-len(suffix)] step1_success = True elif suffix in ("atrice", "ateur", "ation", "atrices", "ateurs", "ations") and suffix in r2: word = word[:-len(suffix)] step1_success = True if word[-2:] == "ic": if "ic" in r2: word = word[:-2] else: word = "".join((word[:-2], "iqU")) elif suffix in ("logie", "logies") and suffix in r2: word = suffix_replace(word, suffix, "log") step1_success = True elif (suffix in ("usion", "ution", "usions", "utions") and suffix in r2): word = suffix_replace(word, suffix, "u") step1_success = True elif suffix in ("ence", "ences") and suffix in r2: word = suffix_replace(word, suffix, "ent") step1_success = True elif suffix in ("it\xE9", "it\xE9s") and suffix in r2: word = word[:-len(suffix)] step1_success = True if word[-4:] == "abil": if "abil" in r2: word = word[:-4] else: word = "".join((word[:-2], "l")) elif word[-2:] == "ic": if "ic" in r2: word = word[:-2] else: word = "".join((word[:-2], "iqU")) elif word[-2:] == "iv": if "iv" in r2: word = word[:-2] elif (suffix in ("if", "ive", "ifs", "ives") and suffix in r2): word = word[:-len(suffix)] step1_success = True if word[-2:] == "at" and "at" in r2: word = word[:-2] if word[-2:] == "ic": if "ic" in r2: word = word[:-2] else: word = "".join((word[:-2], "iqU")) break # STEP 2a: Verb suffixes beginning 'i' if not step1_success or rv_ending_found: for suffix in self.__step2a_suffixes: if word.endswith(suffix): if (suffix in 
rv and len(rv) > len(suffix) and rv[rv.rindex(suffix)-1] not in self.__vowels): word = word[:-len(suffix)] step2a_success = True break # STEP 2b: Other verb suffixes if not step2a_success: for suffix in self.__step2b_suffixes: if rv.endswith(suffix): if suffix == "ions" and "ions" in r2: word = word[:-4] step2b_success = True elif suffix in ('eraIent', 'erions', '\xE8rent', 'erais', 'erait', 'eriez', 'erons', 'eront', 'erai', 'eras', 'erez', '\xE9es', 'era', 'iez', '\xE9e', '\xE9s', 'er', 'ez', '\xE9'): word = word[:-len(suffix)] step2b_success = True elif suffix in ('assions', 'assent', 'assiez', 'aIent', 'antes', 'asses', '\xE2mes', '\xE2tes', 'ante', 'ants', 'asse', 'ais', 'ait', 'ant', '\xE2t', 'ai', 'as', 'a'): word = word[:-len(suffix)] rv = rv[:-len(suffix)] step2b_success = True if rv.endswith("e"): word = word[:-1] break # STEP 3 if step1_success or step2a_success or step2b_success: if word[-1] == "Y": word = "".join((word[:-1], "i")) elif word[-1] == "\xE7": word = "".join((word[:-1], "c")) # STEP 4: Residual suffixes else: if (len(word) >= 2 and word[-1] == "s" and word[-2] not in "aiou\xE8s"): word = word[:-1] for suffix in self.__step4_suffixes: if word.endswith(suffix): if suffix in rv: if (suffix == "ion" and suffix in r2 and rv[-4] in "st"): word = word[:-3] elif suffix in ("ier", "i\xE8re", "Ier", "I\xE8re"): word = suffix_replace(word, suffix, "i") elif suffix == "e": word = word[:-1] elif suffix == "\xEB" and word[-3:-1] == "gu": word = word[:-1] break # STEP 5: Undouble if word.endswith(("enn", "onn", "ett", "ell", "eill")): word = word[:-1] # STEP 6: Un-accent for i in range(1, len(word)): if word[-i] not in self.__vowels: i += 1 else: if i != 1 and word[-i] in ("\xE9", "\xE8"): word = "".join((word[:-i], "e", word[-i+1:])) break word = (word.replace("I", "i") .replace("U", "u") .replace("Y", "y")) return word def __rv_french(self, word, vowels): """ Return the region RV that is used by the French stemmer. If the word begins with two vowels, RV is the region after the third letter. Otherwise, it is the region after the first vowel not at the beginning of the word, or the end of the word if these positions cannot be found. (Exceptionally, u'par', u'col' or u'tap' at the beginning of a word is also taken to define RV as the region to their right.) :param word: The French word whose region RV is determined. :type word: str or unicode :param vowels: The French vowels that are used to determine the region RV. :type vowels: unicode :return: the region RV for the respective French word. :rtype: unicode :note: This helper method is invoked by the stem method of the subclass FrenchStemmer. It is not to be invoked directly! """ rv = "" if len(word) >= 2: if (word.startswith(("par", "col", "tap")) or (word[0] in vowels and word[1] in vowels)): rv = word[3:] else: for i in range(1, len(word)): if word[i] in vowels: rv = word[i+1:] break return rv class GermanStemmer(_StandardStemmer): """ The German Snowball stemmer. :cvar __vowels: The German vowels. :type __vowels: unicode :cvar __s_ending: Letters that may directly appear before a word final 's'. :type __s_ending: unicode :cvar __st_ending: Letter that may directly appear before a word final 'st'. :type __st_ending: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. 
:type __step3_suffixes: tuple :note: A detailed description of the German stemming algorithm can be found under http://snowball.tartarus.org/algorithms/german/stemmer.html """ __vowels = "aeiouy\xE4\xF6\xFC" __s_ending = "bdfghklmnrt" __st_ending = "bdfghklmnt" __step1_suffixes = ("ern", "em", "er", "en", "es", "e", "s") __step2_suffixes = ("est", "en", "er", "st") __step3_suffixes = ("isch", "lich", "heit", "keit", "end", "ung", "ig", "ik") def stem(self, word): """ Stem a German word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() if word in self.stopwords: return word word = word.replace("\xDF", "ss") # Every occurrence of 'u' and 'y' # between vowels is put into upper case. for i in range(1, len(word)-1): if word[i-1] in self.__vowels and word[i+1] in self.__vowels: if word[i] == "u": word = "".join((word[:i], "U", word[i+1:])) elif word[i] == "y": word = "".join((word[:i], "Y", word[i+1:])) r1, r2 = self._r1r2_standard(word, self.__vowels) # R1 is adjusted so that the region before it # contains at least 3 letters. for i in range(1, len(word)): if word[i] not in self.__vowels and word[i-1] in self.__vowels: if len(word[:i+1]) < 3 and len(word[:i+1]) > 0: r1 = word[3:] elif len(word[:i+1]) == 0: return word break # STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if (suffix in ("en", "es", "e") and word[-len(suffix)-4:-len(suffix)] == "niss"): word = word[:-len(suffix)-1] r1 = r1[:-len(suffix)-1] r2 = r2[:-len(suffix)-1] elif suffix == "s": if word[-2] in self.__s_ending: word = word[:-1] r1 = r1[:-1] r2 = r2[:-1] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 2 for suffix in self.__step2_suffixes: if r1.endswith(suffix): if suffix == "st": if word[-3] in self.__st_ending and len(word[:-3]) >= 3: word = word[:-2] r1 = r1[:-2] r2 = r2[:-2] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] break # STEP 3: Derivational suffixes for suffix in self.__step3_suffixes: if r2.endswith(suffix): if suffix in ("end", "ung"): if ("ig" in r2[-len(suffix)-2:-len(suffix)] and "e" not in r2[-len(suffix)-3:-len(suffix)-2]): word = word[:-len(suffix)-2] else: word = word[:-len(suffix)] elif (suffix in ("ig", "ik", "isch") and "e" not in r2[-len(suffix)-1:-len(suffix)]): word = word[:-len(suffix)] elif suffix in ("lich", "heit"): if ("er" in r1[-len(suffix)-2:-len(suffix)] or "en" in r1[-len(suffix)-2:-len(suffix)]): word = word[:-len(suffix)-2] else: word = word[:-len(suffix)] elif suffix == "keit": if "lich" in r2[-len(suffix)-4:-len(suffix)]: word = word[:-len(suffix)-4] elif "ig" in r2[-len(suffix)-2:-len(suffix)]: word = word[:-len(suffix)-2] else: word = word[:-len(suffix)] break # Umlaut accents are removed and # 'u' and 'y' are put back into lower case. word = (word.replace("\xE4", "a").replace("\xF6", "o") .replace("\xFC", "u").replace("U", "u") .replace("Y", "y")) return word class HungarianStemmer(_LanguageSpecificStemmer): """ The Hungarian Snowball stemmer. :cvar __vowels: The Hungarian vowels. :type __vowels: unicode :cvar __digraphs: The Hungarian digraphs. :type __digraphs: tuple :cvar __double_consonants: The Hungarian double consonants. :type __double_consonants: tuple :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. 
:type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. :type __step4_suffixes: tuple :cvar __step5_suffixes: Suffixes to be deleted in step 5 of the algorithm. :type __step5_suffixes: tuple :cvar __step6_suffixes: Suffixes to be deleted in step 6 of the algorithm. :type __step6_suffixes: tuple :cvar __step7_suffixes: Suffixes to be deleted in step 7 of the algorithm. :type __step7_suffixes: tuple :cvar __step8_suffixes: Suffixes to be deleted in step 8 of the algorithm. :type __step8_suffixes: tuple :cvar __step9_suffixes: Suffixes to be deleted in step 9 of the algorithm. :type __step9_suffixes: tuple :note: A detailed description of the Hungarian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/hungarian/stemmer.html """ __vowels = "aeiou\xF6\xFC\xE1\xE9\xED\xF3\xF5\xFA\xFB" __digraphs = ("cs", "dz", "dzs", "gy", "ly", "ny", "ty", "zs") __double_consonants = ("bb", "cc", "ccs", "dd", "ff", "gg", "ggy", "jj", "kk", "ll", "lly", "mm", "nn", "nny", "pp", "rr", "ss", "ssz", "tt", "tty", "vv", "zz", "zzs") __step1_suffixes = ("al", "el") __step2_suffixes = ('k\xE9ppen', 'onk\xE9nt', 'enk\xE9nt', 'ank\xE9nt', 'k\xE9pp', 'k\xE9nt', 'ban', 'ben', 'nak', 'nek', 'val', 'vel', 't\xF3l', 't\xF5l', 'r\xF3l', 'r\xF5l', 'b\xF3l', 'b\xF5l', 'hoz', 'hez', 'h\xF6z', 'n\xE1l', 'n\xE9l', '\xE9rt', 'kor', 'ba', 'be', 'ra', 're', 'ig', 'at', 'et', 'ot', '\xF6t', 'ul', '\xFCl', 'v\xE1', 'v\xE9', 'en', 'on', 'an', '\xF6n', 'n', 't') __step3_suffixes = ("\xE1nk\xE9nt", "\xE1n", "\xE9n") __step4_suffixes = ('astul', 'est\xFCl', '\xE1stul', '\xE9st\xFCl', 'stul', 'st\xFCl') __step5_suffixes = ("\xE1", "\xE9") __step6_suffixes = ('ok\xE9', '\xF6k\xE9', 'ak\xE9', 'ek\xE9', '\xE1k\xE9', '\xE1\xE9i', '\xE9k\xE9', '\xE9\xE9i', 'k\xE9', '\xE9i', '\xE9\xE9', '\xE9') __step7_suffixes = ('\xE1juk', '\xE9j\xFCk', '\xFCnk', 'unk', 'juk', 'j\xFCk', '\xE1nk', '\xE9nk', 'nk', 'uk', '\xFCk', 'em', 'om', 'am', 'od', 'ed', 'ad', '\xF6d', 'ja', 'je', '\xE1m', '\xE1d', '\xE9m', '\xE9d', 'm', 'd', 'a', 'e', 'o', '\xE1', '\xE9') __step8_suffixes = ('jaitok', 'jeitek', 'jaink', 'jeink', 'aitok', 'eitek', '\xE1itok', '\xE9itek', 'jaim', 'jeim', 'jaid', 'jeid', 'eink', 'aink', 'itek', 'jeik', 'jaik', '\xE1ink', '\xE9ink', 'aim', 'eim', 'aid', 'eid', 'jai', 'jei', 'ink', 'aik', 'eik', '\xE1im', '\xE1id', '\xE1ik', '\xE9im', '\xE9id', '\xE9ik', 'im', 'id', 'ai', 'ei', 'ik', '\xE1i', '\xE9i', 'i') __step9_suffixes = ("\xE1k", "\xE9k", "\xF6k", "ok", "ek", "ak", "k") def stem(self, word): """ Stem an Hungarian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
:rtype: unicode """ word = word.lower() if word in self.stopwords: return word r1 = self.__r1_hungarian(word, self.__vowels, self.__digraphs) # STEP 1: Remove instrumental case if r1.endswith(self.__step1_suffixes): for double_cons in self.__double_consonants: if word[-2-len(double_cons):-2] == double_cons: word = "".join((word[:-4], word[-3])) if r1[-2-len(double_cons):-2] == double_cons: r1 = "".join((r1[:-4], r1[-3])) break # STEP 2: Remove frequent cases for suffix in self.__step2_suffixes: if word.endswith(suffix): if r1.endswith(suffix): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] if r1.endswith("\xE1"): word = "".join((word[:-1], "a")) r1 = suffix_replace(r1, "\xE1", "a") elif r1.endswith("\xE9"): word = "".join((word[:-1], "e")) r1 = suffix_replace(r1, "\xE9", "e") break # STEP 3: Remove special cases for suffix in self.__step3_suffixes: if r1.endswith(suffix): if suffix == "\xE9n": word = suffix_replace(word, suffix, "e") r1 = suffix_replace(r1, suffix, "e") else: word = suffix_replace(word, suffix, "a") r1 = suffix_replace(r1, suffix, "a") break # STEP 4: Remove other cases for suffix in self.__step4_suffixes: if r1.endswith(suffix): if suffix == "\xE1stul": word = suffix_replace(word, suffix, "a") r1 = suffix_replace(r1, suffix, "a") elif suffix == "\xE9st\xFCl": word = suffix_replace(word, suffix, "e") r1 = suffix_replace(r1, suffix, "e") else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] break # STEP 5: Remove factive case for suffix in self.__step5_suffixes: if r1.endswith(suffix): for double_cons in self.__double_consonants: if word[-1-len(double_cons):-1] == double_cons: word = "".join((word[:-3], word[-2])) if r1[-1-len(double_cons):-1] == double_cons: r1 = "".join((r1[:-3], r1[-2])) break # STEP 6: Remove owned for suffix in self.__step6_suffixes: if r1.endswith(suffix): if suffix in ("\xE1k\xE9", "\xE1\xE9i"): word = suffix_replace(word, suffix, "a") r1 = suffix_replace(r1, suffix, "a") elif suffix in ("\xE9k\xE9", "\xE9\xE9i", "\xE9\xE9"): word = suffix_replace(word, suffix, "e") r1 = suffix_replace(r1, suffix, "e") else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] break # STEP 7: Remove singular owner suffixes for suffix in self.__step7_suffixes: if word.endswith(suffix): if r1.endswith(suffix): if suffix in ("\xE1nk", "\xE1juk", "\xE1m", "\xE1d", "\xE1"): word = suffix_replace(word, suffix, "a") r1 = suffix_replace(r1, suffix, "a") elif suffix in ("\xE9nk", "\xE9j\xFCk", "\xE9m", "\xE9d", "\xE9"): word = suffix_replace(word, suffix, "e") r1 = suffix_replace(r1, suffix, "e") else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] break # STEP 8: Remove plural owner suffixes for suffix in self.__step8_suffixes: if word.endswith(suffix): if r1.endswith(suffix): if suffix in ("\xE1im", "\xE1id", "\xE1i", "\xE1ink", "\xE1itok", "\xE1ik"): word = suffix_replace(word, suffix, "a") r1 = suffix_replace(r1, suffix, "a") elif suffix in ("\xE9im", "\xE9id", "\xE9i", "\xE9ink", "\xE9itek", "\xE9ik"): word = suffix_replace(word, suffix, "e") r1 = suffix_replace(r1, suffix, "e") else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] break # STEP 9: Remove plural suffixes for suffix in self.__step9_suffixes: if word.endswith(suffix): if r1.endswith(suffix): if suffix == "\xE1k": word = suffix_replace(word, suffix, "a") elif suffix == "\xE9k": word = suffix_replace(word, suffix, "e") else: word = word[:-len(suffix)] break return word def __r1_hungarian(self, word, vowels, digraphs): """ Return the region R1 that is used by the Hungarian stemmer. 
If the word begins with a vowel, R1 is defined as the region after the first consonant or digraph (= two letters stand for one phoneme) in the word. If the word begins with a consonant, it is defined as the region after the first vowel in the word. If the word does not contain both a vowel and consonant, R1 is the null region at the end of the word. :param word: The Hungarian word whose region R1 is determined. :type word: str or unicode :param vowels: The Hungarian vowels that are used to determine the region R1. :type vowels: unicode :param digraphs: The digraphs that are used to determine the region R1. :type digraphs: tuple :return: the region R1 for the respective word. :rtype: unicode :note: This helper method is invoked by the stem method of the subclass HungarianStemmer. It is not to be invoked directly! """ r1 = "" if word[0] in vowels: for digraph in digraphs: if digraph in word[1:]: r1 = word[word.index(digraph[-1])+1:] return r1 for i in range(1, len(word)): if word[i] not in vowels: r1 = word[i+1:] break else: for i in range(1, len(word)): if word[i] in vowels: r1 = word[i+1:] break return r1 class ItalianStemmer(_StandardStemmer): """ The Italian Snowball stemmer. :cvar __vowels: The Italian vowels. :type __vowels: unicode :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. :type __step0_suffixes: tuple :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :note: A detailed description of the Italian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/italian/stemmer.html """ __vowels = "aeiou\xE0\xE8\xEC\xF2\xF9" __step0_suffixes = ('gliela', 'gliele', 'glieli', 'glielo', 'gliene', 'sene', 'mela', 'mele', 'meli', 'melo', 'mene', 'tela', 'tele', 'teli', 'telo', 'tene', 'cela', 'cele', 'celi', 'celo', 'cene', 'vela', 'vele', 'veli', 'velo', 'vene', 'gli', 'ci', 'la', 'le', 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi') __step1_suffixes = ('atrice', 'atrici', 'azione', 'azioni', 'uzione', 'uzioni', 'usione', 'usioni', 'amento', 'amenti', 'imento', 'imenti', 'amente', 'abile', 'abili', 'ibile', 'ibili', 'mente', 'atore', 'atori', 'logia', 'logie', 'anza', 'anze', 'iche', 'ichi', 'ismo', 'ismi', 'ista', 'iste', 'isti', 'ist\xE0', 'ist\xE8', 'ist\xEC', 'ante', 'anti', 'enza', 'enze', 'ico', 'ici', 'ica', 'ice', 'oso', 'osi', 'osa', 'ose', 'it\xE0', 'ivo', 'ivi', 'iva', 'ive') __step2_suffixes = ('erebbero', 'irebbero', 'assero', 'assimo', 'eranno', 'erebbe', 'eremmo', 'ereste', 'eresti', 'essero', 'iranno', 'irebbe', 'iremmo', 'ireste', 'iresti', 'iscano', 'iscono', 'issero', 'arono', 'avamo', 'avano', 'avate', 'eremo', 'erete', 'erono', 'evamo', 'evano', 'evate', 'iremo', 'irete', 'irono', 'ivamo', 'ivano', 'ivate', 'ammo', 'ando', 'asse', 'assi', 'emmo', 'enda', 'ende', 'endi', 'endo', 'erai', 'erei', 'Yamo', 'iamo', 'immo', 'irai', 'irei', 'isca', 'isce', 'isci', 'isco', 'ano', 'are', 'ata', 'ate', 'ati', 'ato', 'ava', 'avi', 'avo', 'er\xE0', 'ere', 'er\xF2', 'ete', 'eva', 'evi', 'evo', 'ir\xE0', 'ire', 'ir\xF2', 'ita', 'ite', 'iti', 'ito', 'iva', 'ivi', 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto', 'ar', 'ir') def stem(self, word): """ Stem an Italian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
:rtype: unicode """ word = word.lower() if word in self.stopwords: return word step1_success = False # All acute accents are replaced by grave accents. word = (word.replace("\xE1", "\xE0") .replace("\xE9", "\xE8") .replace("\xED", "\xEC") .replace("\xF3", "\xF2") .replace("\xFA", "\xF9")) # Every occurrence of 'u' after 'q' # is put into upper case. for i in range(1, len(word)): if word[i-1] == "q" and word[i] == "u": word = "".join((word[:i], "U", word[i+1:])) # Every occurrence of 'u' and 'i' # between vowels is put into upper case. for i in range(1, len(word)-1): if word[i-1] in self.__vowels and word[i+1] in self.__vowels: if word[i] == "u": word = "".join((word[:i], "U", word[i+1:])) elif word [i] == "i": word = "".join((word[:i], "I", word[i+1:])) r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self._rv_standard(word, self.__vowels) # STEP 0: Attached pronoun for suffix in self.__step0_suffixes: if rv.endswith(suffix): if rv[-len(suffix)-4:-len(suffix)] in ("ando", "endo"): word = word[:-len(suffix)] r1 = r1[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] elif (rv[-len(suffix)-2:-len(suffix)] in ("ar", "er", "ir")): word = suffix_replace(word, suffix, "e") r1 = suffix_replace(r1, suffix, "e") r2 = suffix_replace(r2, suffix, "e") rv = suffix_replace(rv, suffix, "e") break # STEP 1: Standard suffix removal for suffix in self.__step1_suffixes: if word.endswith(suffix): if suffix == "amente" and r1.endswith(suffix): step1_success = True word = word[:-6] r2 = r2[:-6] rv = rv[:-6] if r2.endswith("iv"): word = word[:-2] r2 = r2[:-2] rv = rv[:-2] if r2.endswith("at"): word = word[:-2] rv = rv[:-2] elif r2.endswith(("os", "ic")): word = word[:-2] rv = rv[:-2] elif r2 .endswith("abil"): word = word[:-4] rv = rv[:-4] elif (suffix in ("amento", "amenti", "imento", "imenti") and rv.endswith(suffix)): step1_success = True word = word[:-6] rv = rv[:-6] elif r2.endswith(suffix): step1_success = True if suffix in ("azione", "azioni", "atore", "atori"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] if r2.endswith("ic"): word = word[:-2] rv = rv[:-2] elif suffix in ("logia", "logie"): word = word[:-2] rv = word[:-2] elif suffix in ("uzione", "uzioni", "usione", "usioni"): word = word[:-5] rv = rv[:-5] elif suffix in ("enza", "enze"): word = suffix_replace(word, suffix, "te") rv = suffix_replace(rv, suffix, "te") elif suffix == "it\xE0": word = word[:-3] r2 = r2[:-3] rv = rv[:-3] if r2.endswith(("ic", "iv")): word = word[:-2] rv = rv[:-2] elif r2.endswith("abil"): word = word[:-4] rv = rv[:-4] elif suffix in ("ivo", "ivi", "iva", "ive"): word = word[:-3] r2 = r2[:-3] rv = rv[:-3] if r2.endswith("at"): word = word[:-2] r2 = r2[:-2] rv = rv[:-2] if r2.endswith("ic"): word = word[:-2] rv = rv[:-2] else: word = word[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 2: Verb suffixes if not step1_success: for suffix in self.__step2_suffixes: if rv.endswith(suffix): word = word[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 3a if rv.endswith(("a", "e", "i", "o", "\xE0", "\xE8", "\xEC", "\xF2")): word = word[:-1] rv = rv[:-1] if rv.endswith("i"): word = word[:-1] rv = rv[:-1] # STEP 3b if rv.endswith(("ch", "gh")): word = word[:-1] word = word.replace("I", "i").replace("U", "u") return word class NorwegianStemmer(_ScandinavianStemmer): """ The Norwegian Snowball stemmer. :cvar __vowels: The Norwegian vowels. :type __vowels: unicode :cvar __s_ending: Letters that may directly appear before a word final 's'. 
:type __s_ending: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :note: A detailed description of the Norwegian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/norwegian/stemmer.html """ __vowels = "aeiouy\xE6\xE5\xF8" __s_ending = "bcdfghjlmnoprtvyz" __step1_suffixes = ("hetenes", "hetene", "hetens", "heter", "heten", "endes", "ande", "ende", "edes", "enes", "erte", "ede", "ane", "ene", "ens", "ers", "ets", "het", "ast", "ert", "en", "ar", "er", "as", "es", "et", "a", "e", "s") __step2_suffixes = ("dt", "vt") __step3_suffixes = ("hetslov", "eleg", "elig", "elov", "slov", "leg", "eig", "lig", "els", "lov", "ig") def stem(self, word): """ Stem a Norwegian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() if word in self.stopwords: return word r1 = self._r1_scandinavian(word, self.__vowels) # STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if suffix in ("erte", "ert"): word = suffix_replace(word, suffix, "er") r1 = suffix_replace(r1, suffix, "er") elif suffix == "s": if (word[-2] in self.__s_ending or (word[-2] == "k" and word[-3] not in self.__vowels)): word = word[:-1] r1 = r1[:-1] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] break # STEP 2 for suffix in self.__step2_suffixes: if r1.endswith(suffix): word = word[:-1] r1 = r1[:-1] break # STEP 3 for suffix in self.__step3_suffixes: if r1.endswith(suffix): word = word[:-len(suffix)] break return word class PortugueseStemmer(_StandardStemmer): """ The Portuguese Snowball stemmer. :cvar __vowels: The Portuguese vowels. :type __vowels: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step4_suffixes: Suffixes to be deleted in step 4 of the algorithm. 
:type __step4_suffixes: tuple :note: A detailed description of the Portuguese stemming algorithm can be found under http://snowball.tartarus.org/algorithms/portuguese/stemmer.html """ __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xE2\xEA\xF4" __step1_suffixes = ('amentos', 'imentos', 'uço~es', 'amento', 'imento', 'adoras', 'adores', 'a\xE7o~es', 'logias', '\xEAncias', 'amente', 'idades', 'an\xE7as', 'ismos', 'istas', 'adora', 'a\xE7a~o', 'antes', '\xE2ncia', 'logia', 'uça~o', '\xEAncia', 'mente', 'idade', 'an\xE7a', 'ezas', 'icos', 'icas', 'ismo', '\xE1vel', '\xEDvel', 'ista', 'osos', 'osas', 'ador', 'ante', 'ivas', 'ivos', 'iras', 'eza', 'ico', 'ica', 'oso', 'osa', 'iva', 'ivo', 'ira') __step2_suffixes = ('ar\xEDamos', 'er\xEDamos', 'ir\xEDamos', '\xE1ssemos', '\xEAssemos', '\xEDssemos', 'ar\xEDeis', 'er\xEDeis', 'ir\xEDeis', '\xE1sseis', '\xE9sseis', '\xEDsseis', '\xE1ramos', '\xE9ramos', '\xEDramos', '\xE1vamos', 'aremos', 'eremos', 'iremos', 'ariam', 'eriam', 'iriam', 'assem', 'essem', 'issem', 'ara~o', 'era~o', 'ira~o', 'arias', 'erias', 'irias', 'ardes', 'erdes', 'irdes', 'asses', 'esses', 'isses', 'astes', 'estes', 'istes', '\xE1reis', 'areis', '\xE9reis', 'ereis', '\xEDreis', 'ireis', '\xE1veis', '\xEDamos', 'armos', 'ermos', 'irmos', 'aria', 'eria', 'iria', 'asse', 'esse', 'isse', 'aste', 'este', 'iste', 'arei', 'erei', 'irei', 'aram', 'eram', 'iram', 'avam', 'arem', 'erem', 'irem', 'ando', 'endo', 'indo', 'adas', 'idas', 'ar\xE1s', 'aras', 'er\xE1s', 'eras', 'ir\xE1s', 'avas', 'ares', 'eres', 'ires', '\xEDeis', 'ados', 'idos', '\xE1mos', 'amos', 'emos', 'imos', 'iras', 'ada', 'ida', 'ar\xE1', 'ara', 'er\xE1', 'era', 'ir\xE1', 'ava', 'iam', 'ado', 'ido', 'ias', 'ais', 'eis', 'ira', 'ia', 'ei', 'am', 'em', 'ar', 'er', 'ir', 'as', 'es', 'is', 'eu', 'iu', 'ou') __step4_suffixes = ("os", "a", "i", "o", "\xE1", "\xED", "\xF3") def stem(self, word): """ Stem a Portuguese word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
:rtype: unicode """ word = word.lower() if word in self.stopwords: return word step1_success = False step2_success = False word = (word.replace("\xE3", "a~") .replace("\xF5", "o~") .replace("q\xFC", "qu") .replace("g\xFC", "gu")) r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self._rv_standard(word, self.__vowels) # STEP 1: Standard suffix removal for suffix in self.__step1_suffixes: if word.endswith(suffix): if suffix == "amente" and r1.endswith(suffix): step1_success = True word = word[:-6] r2 = r2[:-6] rv = rv[:-6] if r2.endswith("iv"): word = word[:-2] r2 = r2[:-2] rv = rv[:-2] if r2.endswith("at"): word = word[:-2] rv = rv[:-2] elif r2.endswith(("os", "ic", "ad")): word = word[:-2] rv = rv[:-2] elif (suffix in ("ira", "iras") and rv.endswith(suffix) and word[-len(suffix)-1:-len(suffix)] == "e"): step1_success = True word = suffix_replace(word, suffix, "ir") rv = suffix_replace(rv, suffix, "ir") elif r2.endswith(suffix): step1_success = True if suffix in ("logia", "logias"): word = suffix_replace(word, suffix, "log") rv = suffix_replace(rv, suffix, "log") elif suffix in ("uça~o", "uço~es"): word = suffix_replace(word, suffix, "u") rv = suffix_replace(rv, suffix, "u") elif suffix in ("\xEAncia", "\xEAncias"): word = suffix_replace(word, suffix, "ente") rv = suffix_replace(rv, suffix, "ente") elif suffix == "mente": word = word[:-5] r2 = r2[:-5] rv = rv[:-5] if r2.endswith(("ante", "avel", "ivel")): word = word[:-4] rv = rv[:-4] elif suffix in ("idade", "idades"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] if r2.endswith(("ic", "iv")): word = word[:-2] rv = rv[:-2] elif r2.endswith("abil"): word = word[:-4] rv = rv[:-4] elif suffix in ("iva", "ivo", "ivas", "ivos"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] if r2.endswith("at"): word = word[:-2] rv = rv[:-2] else: word = word[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 2: Verb suffixes if not step1_success: for suffix in self.__step2_suffixes: if rv.endswith(suffix): step2_success = True word = word[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 3 if step1_success or step2_success: if rv.endswith("i") and word[-2] == "c": word = word[:-1] rv = rv[:-1] ### STEP 4: Residual suffix if not step1_success and not step2_success: for suffix in self.__step4_suffixes: if rv.endswith(suffix): word = word[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 5 if rv.endswith(("e", "\xE9", "\xEA")): word = word[:-1] rv = rv[:-1] if ((word.endswith("gu") and rv.endswith("u")) or (word.endswith("ci") and rv.endswith("i"))): word = word[:-1] elif word.endswith("\xE7"): word = suffix_replace(word, "\xE7", "c") word = word.replace("a~", "\xE3").replace("o~", "\xF5") return word class RomanianStemmer(_StandardStemmer): """ The Romanian Snowball stemmer. :cvar __vowels: The Romanian vowels. :type __vowels: unicode :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. :type __step0_suffixes: tuple :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. 
:type __step3_suffixes: tuple :note: A detailed description of the Romanian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/romanian/stemmer.html """ __vowels = "aeiou\u0103\xE2\xEE" __step0_suffixes = ('iilor', 'ului', 'elor', 'iile', 'ilor', 'atei', 'a\u0163ie', 'a\u0163ia', 'aua', 'ele', 'iua', 'iei', 'ile', 'ul', 'ea', 'ii') __step1_suffixes = ('abilitate', 'abilitati', 'abilit\u0103\u0163i', 'ibilitate', 'abilit\u0103i', 'ivitate', 'ivitati', 'ivit\u0103\u0163i', 'icitate', 'icitati', 'icit\u0103\u0163i', 'icatori', 'ivit\u0103i', 'icit\u0103i', 'icator', 'a\u0163iune', 'atoare', '\u0103toare', 'i\u0163iune', 'itoare', 'iciva', 'icive', 'icivi', 'iciv\u0103', 'icala', 'icale', 'icali', 'ical\u0103', 'ativa', 'ative', 'ativi', 'ativ\u0103', 'atori', '\u0103tori', 'itiva', 'itive', 'itivi', 'itiv\u0103', 'itori', 'iciv', 'ical', 'ativ', 'ator', '\u0103tor', 'itiv', 'itor') __step2_suffixes = ('abila', 'abile', 'abili', 'abil\u0103', 'ibila', 'ibile', 'ibili', 'ibil\u0103', 'atori', 'itate', 'itati', 'it\u0103\u0163i', 'abil', 'ibil', 'oasa', 'oas\u0103', 'oase', 'anta', 'ante', 'anti', 'ant\u0103', 'ator', 'it\u0103i', 'iune', 'iuni', 'isme', 'ista', 'iste', 'isti', 'ist\u0103', 'i\u015Fti', 'ata', 'at\u0103', 'ati', 'ate', 'uta', 'ut\u0103', 'uti', 'ute', 'ita', 'it\u0103', 'iti', 'ite', 'ica', 'ice', 'ici', 'ic\u0103', 'osi', 'o\u015Fi', 'ant', 'iva', 'ive', 'ivi', 'iv\u0103', 'ism', 'ist', 'at', 'ut', 'it', 'ic', 'os', 'iv') __step3_suffixes = ('seser\u0103\u0163i', 'aser\u0103\u0163i', 'iser\u0103\u0163i', '\xE2ser\u0103\u0163i', 'user\u0103\u0163i', 'seser\u0103m', 'aser\u0103m', 'iser\u0103m', '\xE2ser\u0103m', 'user\u0103m', 'ser\u0103\u0163i', 'sese\u015Fi', 'seser\u0103', 'easc\u0103', 'ar\u0103\u0163i', 'ur\u0103\u0163i', 'ir\u0103\u0163i', '\xE2r\u0103\u0163i', 'ase\u015Fi', 'aser\u0103', 'ise\u015Fi', 'iser\u0103', '\xe2se\u015Fi', '\xE2ser\u0103', 'use\u015Fi', 'user\u0103', 'ser\u0103m', 'sesem', 'indu', '\xE2ndu', 'eaz\u0103', 'e\u015Fti', 'e\u015Fte', '\u0103\u015Fti', '\u0103\u015Fte', 'ea\u0163i', 'ia\u0163i', 'ar\u0103m', 'ur\u0103m', 'ir\u0103m', '\xE2r\u0103m', 'asem', 'isem', '\xE2sem', 'usem', 'se\u015Fi', 'ser\u0103', 'sese', 'are', 'ere', 'ire', '\xE2re', 'ind', '\xE2nd', 'eze', 'ezi', 'esc', '\u0103sc', 'eam', 'eai', 'eau', 'iam', 'iai', 'iau', 'a\u015Fi', 'ar\u0103', 'u\u015Fi', 'ur\u0103', 'i\u015Fi', 'ir\u0103', '\xE2\u015Fi', '\xe2r\u0103', 'ase', 'ise', '\xE2se', 'use', 'a\u0163i', 'e\u0163i', 'i\u0163i', '\xe2\u0163i', 'sei', 'ez', 'am', 'ai', 'au', 'ea', 'ia', 'ui', '\xE2i', '\u0103m', 'em', 'im', '\xE2m', 'se') def stem(self, word): """ Stem a Romanian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
:rtype: unicode """ word = word.lower() if word in self.stopwords: return word step1_success = False step2_success = False for i in range(1, len(word)-1): if word[i-1] in self.__vowels and word[i+1] in self.__vowels: if word[i] == "u": word = "".join((word[:i], "U", word[i+1:])) elif word[i] == "i": word = "".join((word[:i], "I", word[i+1:])) r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self._rv_standard(word, self.__vowels) # STEP 0: Removal of plurals and other simplifications for suffix in self.__step0_suffixes: if word.endswith(suffix): if suffix in r1: if suffix in ("ul", "ului"): word = word[:-len(suffix)] if suffix in rv: rv = rv[:-len(suffix)] else: rv = "" elif (suffix == "aua" or suffix == "atei" or (suffix == "ile" and word[-5:-3] != "ab")): word = word[:-2] elif suffix in ("ea", "ele", "elor"): word = suffix_replace(word, suffix, "e") if suffix in rv: rv = suffix_replace(rv, suffix, "e") else: rv = "" elif suffix in ("ii", "iua", "iei", "iile", "iilor", "ilor"): word = suffix_replace(word, suffix, "i") if suffix in rv: rv = suffix_replace(rv, suffix, "i") else: rv = "" elif suffix in ("a\u0163ie", "a\u0163ia"): word = word[:-1] break # STEP 1: Reduction of combining suffixes while True: replacement_done = False for suffix in self.__step1_suffixes: if word.endswith(suffix): if suffix in r1: step1_success = True replacement_done = True if suffix in ("abilitate", "abilitati", "abilit\u0103i", "abilit\u0103\u0163i"): word = suffix_replace(word, suffix, "abil") elif suffix == "ibilitate": word = word[:-5] elif suffix in ("ivitate", "ivitati", "ivit\u0103i", "ivit\u0103\u0163i"): word = suffix_replace(word, suffix, "iv") elif suffix in ("icitate", "icitati", "icit\u0103i", "icit\u0103\u0163i", "icator", "icatori", "iciv", "iciva", "icive", "icivi", "iciv\u0103", "ical", "icala", "icale", "icali", "ical\u0103"): word = suffix_replace(word, suffix, "ic") elif suffix in ("ativ", "ativa", "ative", "ativi", "ativ\u0103", "a\u0163iune", "atoare", "ator", "atori", "\u0103toare", "\u0103tor", "\u0103tori"): word = suffix_replace(word, suffix, "at") if suffix in r2: r2 = suffix_replace(r2, suffix, "at") elif suffix in ("itiv", "itiva", "itive", "itivi", "itiv\u0103", "i\u0163iune", "itoare", "itor", "itori"): word = suffix_replace(word, suffix, "it") if suffix in r2: r2 = suffix_replace(r2, suffix, "it") else: step1_success = False break if not replacement_done: break # STEP 2: Removal of standard suffixes for suffix in self.__step2_suffixes: if word.endswith(suffix): if suffix in r2: step2_success = True if suffix in ("iune", "iuni"): if word[-5] == "\u0163": word = "".join((word[:-5], "t")) elif suffix in ("ism", "isme", "ist", "ista", "iste", "isti", "ist\u0103", "i\u015Fti"): word = suffix_replace(word, suffix, "ist") else: word = word[:-len(suffix)] break # STEP 3: Removal of verb suffixes if not step1_success and not step2_success: for suffix in self.__step3_suffixes: if word.endswith(suffix): if suffix in rv: if suffix in ('seser\u0103\u0163i', 'seser\u0103m', 'ser\u0103\u0163i', 'sese\u015Fi', 'seser\u0103', 'ser\u0103m', 'sesem', 'se\u015Fi', 'ser\u0103', 'sese', 'a\u0163i', 'e\u0163i', 'i\u0163i', '\xE2\u0163i', 'sei', '\u0103m', 'em', 'im', '\xE2m', 'se'): word = word[:-len(suffix)] rv = rv[:-len(suffix)] else: if (not rv.startswith(suffix) and rv[rv.index(suffix)-1] not in "aeio\u0103\xE2\xEE"): word = word[:-len(suffix)] break # STEP 4: Removal of final vowel for suffix in ("ie", "a", "e", "i", "\u0103"): if word.endswith(suffix): if suffix in rv: word = 
word[:-len(suffix)] break word = word.replace("I", "i").replace("U", "u") return word class RussianStemmer(_LanguageSpecificStemmer): """ The Russian Snowball stemmer. :cvar __perfective_gerund_suffixes: Suffixes to be deleted. :type __perfective_gerund_suffixes: tuple :cvar __adjectival_suffixes: Suffixes to be deleted. :type __adjectival_suffixes: tuple :cvar __reflexive_suffixes: Suffixes to be deleted. :type __reflexive_suffixes: tuple :cvar __verb_suffixes: Suffixes to be deleted. :type __verb_suffixes: tuple :cvar __noun_suffixes: Suffixes to be deleted. :type __noun_suffixes: tuple :cvar __superlative_suffixes: Suffixes to be deleted. :type __superlative_suffixes: tuple :cvar __derivational_suffixes: Suffixes to be deleted. :type __derivational_suffixes: tuple :note: A detailed description of the Russian stemming algorithm can be found under http://snowball.tartarus.org/algorithms/russian/stemmer.html """ __perfective_gerund_suffixes = ("ivshis'", "yvshis'", "vshis'", "ivshi", "yvshi", "vshi", "iv", "yv", "v") __adjectival_suffixes = ('ui^ushchi^ui^u', 'ui^ushchi^ai^a', 'ui^ushchimi', 'ui^ushchymi', 'ui^ushchego', 'ui^ushchogo', 'ui^ushchemu', 'ui^ushchomu', 'ui^ushchikh', 'ui^ushchykh', 'ui^ushchui^u', 'ui^ushchaia', 'ui^ushchoi^u', 'ui^ushchei^u', 'i^ushchi^ui^u', 'i^ushchi^ai^a', 'ui^ushchee', 'ui^ushchie', 'ui^ushchye', 'ui^ushchoe', 'ui^ushchei`', 'ui^ushchii`', 'ui^ushchyi`', 'ui^ushchoi`', 'ui^ushchem', 'ui^ushchim', 'ui^ushchym', 'ui^ushchom', 'i^ushchimi', 'i^ushchymi', 'i^ushchego', 'i^ushchogo', 'i^ushchemu', 'i^ushchomu', 'i^ushchikh', 'i^ushchykh', 'i^ushchui^u', 'i^ushchai^a', 'i^ushchoi^u', 'i^ushchei^u', 'i^ushchee', 'i^ushchie', 'i^ushchye', 'i^ushchoe', 'i^ushchei`', 'i^ushchii`', 'i^ushchyi`', 'i^ushchoi`', 'i^ushchem', 'i^ushchim', 'i^ushchym', 'i^ushchom', 'shchi^ui^u', 'shchi^ai^a', 'ivshi^ui^u', 'ivshi^ai^a', 'yvshi^ui^u', 'yvshi^ai^a', 'shchimi', 'shchymi', 'shchego', 'shchogo', 'shchemu', 'shchomu', 'shchikh', 'shchykh', 'shchui^u', 'shchai^a', 'shchoi^u', 'shchei^u', 'ivshimi', 'ivshymi', 'ivshego', 'ivshogo', 'ivshemu', 'ivshomu', 'ivshikh', 'ivshykh', 'ivshui^u', 'ivshai^a', 'ivshoi^u', 'ivshei^u', 'yvshimi', 'yvshymi', 'yvshego', 'yvshogo', 'yvshemu', 'yvshomu', 'yvshikh', 'yvshykh', 'yvshui^u', 'yvshai^a', 'yvshoi^u', 'yvshei^u', 'vshi^ui^u', 'vshi^ai^a', 'shchee', 'shchie', 'shchye', 'shchoe', 'shchei`', 'shchii`', 'shchyi`', 'shchoi`', 'shchem', 'shchim', 'shchym', 'shchom', 'ivshee', 'ivshie', 'ivshye', 'ivshoe', 'ivshei`', 'ivshii`', 'ivshyi`', 'ivshoi`', 'ivshem', 'ivshim', 'ivshym', 'ivshom', 'yvshee', 'yvshie', 'yvshye', 'yvshoe', 'yvshei`', 'yvshii`', 'yvshyi`', 'yvshoi`', 'yvshem', 'yvshim', 'yvshym', 'yvshom', 'vshimi', 'vshymi', 'vshego', 'vshogo', 'vshemu', 'vshomu', 'vshikh', 'vshykh', 'vshui^u', 'vshai^a', 'vshoi^u', 'vshei^u', 'emi^ui^u', 'emi^ai^a', 'nni^ui^u', 'nni^ai^a', 'vshee', 'vshie', 'vshye', 'vshoe', 'vshei`', 'vshii`', 'vshyi`', 'vshoi`', 'vshem', 'vshim', 'vshym', 'vshom', 'emimi', 'emymi', 'emego', 'emogo', 'ememu', 'emomu', 'emikh', 'emykh', 'emui^u', 'emai^a', 'emoi^u', 'emei^u', 'nnimi', 'nnymi', 'nnego', 'nnogo', 'nnemu', 'nnomu', 'nnikh', 'nnykh', 'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u', 'emee', 'emie', 'emye', 'emoe', 'emei`', 'emii`', 'emyi`', 'emoi`', 'emem', 'emim', 'emym', 'emom', 'nnee', 'nnie', 'nnye', 'nnoe', 'nnei`', 'nnii`', 'nnyi`', 'nnoi`', 'nnem', 'nnim', 'nnym', 'nnom', 'i^ui^u', 'i^ai^a', 'imi', 'ymi', 'ego', 'ogo', 'emu', 'omu', 'ikh', 'ykh', 'ui^u', 'ai^a', 'oi^u', 'ei^u', 'ee', 'ie', 'ye', 'oe', 'ei`', 
'ii`', 'yi`', 'oi`', 'em', 'im', 'ym', 'om') __reflexive_suffixes = ("si^a", "s'") __verb_suffixes = ("esh'", 'ei`te', 'ui`te', 'ui^ut', "ish'", 'ete', 'i`te', 'i^ut', 'nno', 'ila', 'yla', 'ena', 'ite', 'ili', 'yli', 'ilo', 'ylo', 'eno', 'i^at', 'uet', 'eny', "it'", "yt'", 'ui^u', 'la', 'na', 'li', 'em', 'lo', 'no', 'et', 'ny', "t'", 'ei`', 'ui`', 'il', 'yl', 'im', 'ym', 'en', 'it', 'yt', 'i^u', 'i`', 'l', 'n') __noun_suffixes = ('ii^ami', 'ii^akh', 'i^ami', 'ii^am', 'i^akh', 'ami', 'iei`', 'i^am', 'iem', 'akh', 'ii^u', "'i^u", 'ii^a', "'i^a", 'ev', 'ov', 'ie', "'e", 'ei', 'ii', 'ei`', 'oi`', 'ii`', 'em', 'am', 'om', 'i^u', 'i^a', 'a', 'e', 'i', 'i`', 'o', 'u', 'y', "'") __superlative_suffixes = ("ei`she", "ei`sh") __derivational_suffixes = ("ost'", "ost") def stem(self, word): """ Stem a Russian word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ if word in self.stopwords: return word chr_exceeded = False for i in range(len(word)): if ord(word[i]) > 255: chr_exceeded = True break if chr_exceeded: word = self.__cyrillic_to_roman(word) step1_success = False adjectival_removed = False verb_removed = False undouble_success = False superlative_removed = False rv, r2 = self.__regions_russian(word) # Step 1 for suffix in self.__perfective_gerund_suffixes: if rv.endswith(suffix): if suffix in ("v", "vshi", "vshis'"): if (rv[-len(suffix)-3:-len(suffix)] == "i^a" or rv[-len(suffix)-1:-len(suffix)] == "a"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] step1_success = True break else: word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] step1_success = True break if not step1_success: for suffix in self.__reflexive_suffixes: if rv.endswith(suffix): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] break for suffix in self.__adjectival_suffixes: if rv.endswith(suffix): if suffix in ('i^ushchi^ui^u', 'i^ushchi^ai^a', 'i^ushchui^u', 'i^ushchai^a', 'i^ushchoi^u', 'i^ushchei^u', 'i^ushchimi', 'i^ushchymi', 'i^ushchego', 'i^ushchogo', 'i^ushchemu', 'i^ushchomu', 'i^ushchikh', 'i^ushchykh', 'shchi^ui^u', 'shchi^ai^a', 'i^ushchee', 'i^ushchie', 'i^ushchye', 'i^ushchoe', 'i^ushchei`', 'i^ushchii`', 'i^ushchyi`', 'i^ushchoi`', 'i^ushchem', 'i^ushchim', 'i^ushchym', 'i^ushchom', 'vshi^ui^u', 'vshi^ai^a', 'shchui^u', 'shchai^a', 'shchoi^u', 'shchei^u', 'emi^ui^u', 'emi^ai^a', 'nni^ui^u', 'nni^ai^a', 'shchimi', 'shchymi', 'shchego', 'shchogo', 'shchemu', 'shchomu', 'shchikh', 'shchykh', 'vshui^u', 'vshai^a', 'vshoi^u', 'vshei^u', 'shchee', 'shchie', 'shchye', 'shchoe', 'shchei`', 'shchii`', 'shchyi`', 'shchoi`', 'shchem', 'shchim', 'shchym', 'shchom', 'vshimi', 'vshymi', 'vshego', 'vshogo', 'vshemu', 'vshomu', 'vshikh', 'vshykh', 'emui^u', 'emai^a', 'emoi^u', 'emei^u', 'nnui^u', 'nnai^a', 'nnoi^u', 'nnei^u', 'vshee', 'vshie', 'vshye', 'vshoe', 'vshei`', 'vshii`', 'vshyi`', 'vshoi`', 'vshem', 'vshim', 'vshym', 'vshom', 'emimi', 'emymi', 'emego', 'emogo', 'ememu', 'emomu', 'emikh', 'emykh', 'nnimi', 'nnymi', 'nnego', 'nnogo', 'nnemu', 'nnomu', 'nnikh', 'nnykh', 'emee', 'emie', 'emye', 'emoe', 'emei`', 'emii`', 'emyi`', 'emoi`', 'emem', 'emim', 'emym', 'emom', 'nnee', 'nnie', 'nnye', 'nnoe', 'nnei`', 'nnii`', 'nnyi`', 'nnoi`', 'nnem', 'nnim', 'nnym', 'nnom'): if (rv[-len(suffix)-3:-len(suffix)] == "i^a" or rv[-len(suffix)-1:-len(suffix)] == "a"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] adjectival_removed = True break 
else: word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] adjectival_removed = True break if not adjectival_removed: for suffix in self.__verb_suffixes: if rv.endswith(suffix): if suffix in ("la", "na", "ete", "i`te", "li", "i`", "l", "em", "n", "lo", "no", "et", "i^ut", "ny", "t'", "esh'", "nno"): if (rv[-len(suffix)-3:-len(suffix)] == "i^a" or rv[-len(suffix)-1:-len(suffix)] == "a"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] verb_removed = True break else: word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] verb_removed = True break if not adjectival_removed and not verb_removed: for suffix in self.__noun_suffixes: if rv.endswith(suffix): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] break # Step 2 if rv.endswith("i"): word = word[:-1] r2 = r2[:-1] # Step 3 for suffix in self.__derivational_suffixes: if r2.endswith(suffix): word = word[:-len(suffix)] break # Step 4 if word.endswith("nn"): word = word[:-1] undouble_success = True if not undouble_success: for suffix in self.__superlative_suffixes: if word.endswith(suffix): word = word[:-len(suffix)] superlative_removed = True break if word.endswith("nn"): word = word[:-1] if not undouble_success and not superlative_removed: if word.endswith("'"): word = word[:-1] if chr_exceeded: word = self.__roman_to_cyrillic(word) return word def __regions_russian(self, word): """ Return the regions RV and R2 which are used by the Russian stemmer. In any word, RV is the region after the first vowel, or the end of the word if it contains no vowel. R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel. R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel. :param word: The Russian word whose regions RV and R2 are determined. :type word: str or unicode :return: the regions RV and R2 for the respective Russian word. :rtype: tuple :note: This helper method is invoked by the stem method of the subclass RussianStemmer. It is not to be invoked directly! """ r1 = "" r2 = "" rv = "" vowels = ("A", "U", "E", "a", "e", "i", "o", "u", "y") word = (word.replace("i^a", "A") .replace("i^u", "U") .replace("e`", "E")) for i in range(1, len(word)): if word[i] not in vowels and word[i-1] in vowels: r1 = word[i+1:] break for i in range(1, len(r1)): if r1[i] not in vowels and r1[i-1] in vowels: r2 = r1[i+1:] break for i in range(len(word)): if word[i] in vowels: rv = word[i+1:] break r2 = (r2.replace("A", "i^a") .replace("U", "i^u") .replace("E", "e`")) rv = (rv.replace("A", "i^a") .replace("U", "i^u") .replace("E", "e`")) return (rv, r2) def __cyrillic_to_roman(self, word): """ Transliterate a Russian word into the Roman alphabet. A Russian word whose letters consist of the Cyrillic alphabet are transliterated into the Roman alphabet in order to ease the forthcoming stemming process. :param word: The word that is transliterated. :type word: unicode :return: the transliterated word. :rtype: unicode :note: This helper method is invoked by the stem method of the subclass RussianStemmer. It is not to be invoked directly! 
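For illustration only, a minimal sketch of the transliteration above (a hedged example: the helper is private, so it is reached via Python's name mangling, and the default constructor is assumed, which does not require the stopwords corpus):

    >>> from nltk.stem.snowball import RussianStemmer
    >>> stemmer = RussianStemmer()
    >>> print(stemmer._RussianStemmer__cyrillic_to_roman(u"\u043a\u043d\u0438\u0433\u0430"))
    kniga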
""" word = (word.replace("\u0410", "a").replace("\u0430", "a") .replace("\u0411", "b").replace("\u0431", "b") .replace("\u0412", "v").replace("\u0432", "v") .replace("\u0413", "g").replace("\u0433", "g") .replace("\u0414", "d").replace("\u0434", "d") .replace("\u0415", "e").replace("\u0435", "e") .replace("\u0401", "e").replace("\u0451", "e") .replace("\u0416", "zh").replace("\u0436", "zh") .replace("\u0417", "z").replace("\u0437", "z") .replace("\u0418", "i").replace("\u0438", "i") .replace("\u0419", "i`").replace("\u0439", "i`") .replace("\u041A", "k").replace("\u043A", "k") .replace("\u041B", "l").replace("\u043B", "l") .replace("\u041C", "m").replace("\u043C", "m") .replace("\u041D", "n").replace("\u043D", "n") .replace("\u041E", "o").replace("\u043E", "o") .replace("\u041F", "p").replace("\u043F", "p") .replace("\u0420", "r").replace("\u0440", "r") .replace("\u0421", "s").replace("\u0441", "s") .replace("\u0422", "t").replace("\u0442", "t") .replace("\u0423", "u").replace("\u0443", "u") .replace("\u0424", "f").replace("\u0444", "f") .replace("\u0425", "kh").replace("\u0445", "kh") .replace("\u0426", "t^s").replace("\u0446", "t^s") .replace("\u0427", "ch").replace("\u0447", "ch") .replace("\u0428", "sh").replace("\u0448", "sh") .replace("\u0429", "shch").replace("\u0449", "shch") .replace("\u042A", "''").replace("\u044A", "''") .replace("\u042B", "y").replace("\u044B", "y") .replace("\u042C", "'").replace("\u044C", "'") .replace("\u042D", "e`").replace("\u044D", "e`") .replace("\u042E", "i^u").replace("\u044E", "i^u") .replace("\u042F", "i^a").replace("\u044F", "i^a")) return word def __roman_to_cyrillic(self, word): """ Transliterate a Russian word back into the Cyrillic alphabet. A Russian word formerly transliterated into the Roman alphabet in order to ease the stemming process, is transliterated back into the Cyrillic alphabet, its original form. :param word: The word that is transliterated. :type word: str or unicode :return: word, the transliterated word. :rtype: unicode :note: This helper method is invoked by the stem method of the subclass RussianStemmer. It is not to be invoked directly! """ word = (word.replace("i^u", "\u044E").replace("i^a", "\u044F") .replace("shch", "\u0449").replace("kh", "\u0445") .replace("t^s", "\u0446").replace("ch", "\u0447") .replace("e`", "\u044D").replace("i`", "\u0439") .replace("sh", "\u0448").replace("k", "\u043A") .replace("e", "\u0435").replace("zh", "\u0436") .replace("a", "\u0430").replace("b", "\u0431") .replace("v", "\u0432").replace("g", "\u0433") .replace("d", "\u0434").replace("e", "\u0435") .replace("z", "\u0437").replace("i", "\u0438") .replace("l", "\u043B").replace("m", "\u043C") .replace("n", "\u043D").replace("o", "\u043E") .replace("p", "\u043F").replace("r", "\u0440") .replace("s", "\u0441").replace("t", "\u0442") .replace("u", "\u0443").replace("f", "\u0444") .replace("''", "\u044A").replace("y", "\u044B") .replace("'", "\u044C")) return word class SpanishStemmer(_StandardStemmer): """ The Spanish Snowball stemmer. :cvar __vowels: The Spanish vowels. :type __vowels: unicode :cvar __step0_suffixes: Suffixes to be deleted in step 0 of the algorithm. :type __step0_suffixes: tuple :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2a_suffixes: Suffixes to be deleted in step 2a of the algorithm. :type __step2a_suffixes: tuple :cvar __step2b_suffixes: Suffixes to be deleted in step 2b of the algorithm. 
:type __step2b_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :note: A detailed description of the Spanish stemming algorithm can be found under http://snowball.tartarus.org/algorithms/spanish/stemmer.html """ __vowels = "aeiou\xE1\xE9\xED\xF3\xFA\xFC" __step0_suffixes = ("selas", "selos", "sela", "selo", "las", "les", "los", "nos", "me", "se", "la", "le", "lo") __step1_suffixes = ('amientos', 'imientos', 'amiento', 'imiento', 'aciones', 'uciones', 'adoras', 'adores', 'ancias', 'log\xEDas', 'encias', 'amente', 'idades', 'anzas', 'ismos', 'ables', 'ibles', 'istas', 'adora', 'aci\xF3n', 'antes', 'ancia', 'log\xEDa', 'uci\xf3n', 'encia', 'mente', 'anza', 'icos', 'icas', 'ismo', 'able', 'ible', 'ista', 'osos', 'osas', 'ador', 'ante', 'idad', 'ivas', 'ivos', 'ico', 'ica', 'oso', 'osa', 'iva', 'ivo') __step2a_suffixes = ('yeron', 'yendo', 'yamos', 'yais', 'yan', 'yen', 'yas', 'yes', 'ya', 'ye', 'yo', 'y\xF3') __step2b_suffixes = ('ar\xEDamos', 'er\xEDamos', 'ir\xEDamos', 'i\xE9ramos', 'i\xE9semos', 'ar\xEDais', 'aremos', 'er\xEDais', 'eremos', 'ir\xEDais', 'iremos', 'ierais', 'ieseis', 'asteis', 'isteis', '\xE1bamos', '\xE1ramos', '\xE1semos', 'ar\xEDan', 'ar\xEDas', 'ar\xE9is', 'er\xEDan', 'er\xEDas', 'er\xE9is', 'ir\xEDan', 'ir\xEDas', 'ir\xE9is', 'ieran', 'iesen', 'ieron', 'iendo', 'ieras', 'ieses', 'abais', 'arais', 'aseis', '\xE9amos', 'ar\xE1n', 'ar\xE1s', 'ar\xEDa', 'er\xE1n', 'er\xE1s', 'er\xEDa', 'ir\xE1n', 'ir\xE1s', 'ir\xEDa', 'iera', 'iese', 'aste', 'iste', 'aban', 'aran', 'asen', 'aron', 'ando', 'abas', 'adas', 'idas', 'aras', 'ases', '\xEDais', 'ados', 'idos', 'amos', 'imos', 'emos', 'ar\xE1', 'ar\xE9', 'er\xE1', 'er\xE9', 'ir\xE1', 'ir\xE9', 'aba', 'ada', 'ida', 'ara', 'ase', '\xEDan', 'ado', 'ido', '\xEDas', '\xE1is', '\xE9is', '\xEDa', 'ad', 'ed', 'id', 'an', 'i\xF3', 'ar', 'er', 'ir', 'as', '\xEDs', 'en', 'es') __step3_suffixes = ("os", "a", "e", "o", "\xE1", "\xE9", "\xED", "\xF3") def stem(self, word): """ Stem a Spanish word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. 
:rtype: unicode """ word = word.lower() if word in self.stopwords: return word step1_success = False r1, r2 = self._r1r2_standard(word, self.__vowels) rv = self._rv_standard(word, self.__vowels) # STEP 0: Attached pronoun for suffix in self.__step0_suffixes: if not (word.endswith(suffix) and rv.endswith(suffix)): continue if ((rv[:-len(suffix)].endswith(("ando", "\xE1ndo", "ar", "\xE1r", "er", "\xE9r", "iendo", "i\xE9ndo", "ir", "\xEDr"))) or (rv[:-len(suffix)].endswith("yendo") and word[:-len(suffix)].endswith("uyendo"))): word = self.__replace_accented(word[:-len(suffix)]) r1 = self.__replace_accented(r1[:-len(suffix)]) r2 = self.__replace_accented(r2[:-len(suffix)]) rv = self.__replace_accented(rv[:-len(suffix)]) break # STEP 1: Standard suffix removal for suffix in self.__step1_suffixes: if not word.endswith(suffix): continue if suffix == "amente" and r1.endswith(suffix): step1_success = True word = word[:-6] r2 = r2[:-6] rv = rv[:-6] if r2.endswith("iv"): word = word[:-2] r2 = r2[:-2] rv = rv[:-2] if r2.endswith("at"): word = word[:-2] rv = rv[:-2] elif r2.endswith(("os", "ic", "ad")): word = word[:-2] rv = rv[:-2] elif r2.endswith(suffix): step1_success = True if suffix in ("adora", "ador", "aci\xF3n", "adoras", "adores", "aciones", "ante", "antes", "ancia", "ancias"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] if r2.endswith("ic"): word = word[:-2] rv = rv[:-2] elif suffix in ("log\xEDa", "log\xEDas"): word = suffix_replace(word, suffix, "log") rv = suffix_replace(rv, suffix, "log") elif suffix in ("uci\xF3n", "uciones"): word = suffix_replace(word, suffix, "u") rv = suffix_replace(rv, suffix, "u") elif suffix in ("encia", "encias"): word = suffix_replace(word, suffix, "ente") rv = suffix_replace(rv, suffix, "ente") elif suffix == "mente": word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] if r2.endswith(("ante", "able", "ible")): word = word[:-4] rv = rv[:-4] elif suffix in ("idad", "idades"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] for pre_suff in ("abil", "ic", "iv"): if r2.endswith(pre_suff): word = word[:-len(pre_suff)] rv = rv[:-len(pre_suff)] elif suffix in ("ivo", "iva", "ivos", "ivas"): word = word[:-len(suffix)] r2 = r2[:-len(suffix)] rv = rv[:-len(suffix)] if r2.endswith("at"): word = word[:-2] rv = rv[:-2] else: word = word[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 2a: Verb suffixes beginning 'y' if not step1_success: for suffix in self.__step2a_suffixes: if (rv.endswith(suffix) and word[-len(suffix)-1:-len(suffix)] == "u"): word = word[:-len(suffix)] rv = rv[:-len(suffix)] break # STEP 2b: Other verb suffixes for suffix in self.__step2b_suffixes: if rv.endswith(suffix): word = word[:-len(suffix)] rv = rv[:-len(suffix)] if suffix in ("en", "es", "\xE9is", "emos"): if word.endswith("gu"): word = word[:-1] if rv.endswith("gu"): rv = rv[:-1] break # STEP 3: Residual suffix for suffix in self.__step3_suffixes: if rv.endswith(suffix): word = word[:-len(suffix)] if suffix in ("e", "\xE9"): rv = rv[:-len(suffix)] if word[-2:] == "gu" and rv.endswith("u"): word = word[:-1] break word = self.__replace_accented(word) return word def __replace_accented(self, word): """ Replaces all accented letters on a word with their non-accented counterparts. 
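For example, a small illustrative sketch (the method is private, so it is reached here via name mangling; it is normally called internally by ``stem()``):

    >>> from nltk.stem.snowball import SpanishStemmer
    >>> print(SpanishStemmer()._SpanishStemmer__replace_accented(u"caf\xe9s"))
    cafes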
:param word: A spanish word, with or without accents :type word: str or unicode :return: a word with the accented letters (á, é, í, ó, ú) replaced with their non-accented counterparts (a, e, i, o, u) :rtype: str or unicode """ return (word.replace("\xE1", "a") .replace("\xE9", "e") .replace("\xED", "i") .replace("\xF3", "o") .replace("\xFA", "u")) class SwedishStemmer(_ScandinavianStemmer): """ The Swedish Snowball stemmer. :cvar __vowels: The Swedish vowels. :type __vowels: unicode :cvar __s_ending: Letters that may directly appear before a word final 's'. :type __s_ending: unicode :cvar __step1_suffixes: Suffixes to be deleted in step 1 of the algorithm. :type __step1_suffixes: tuple :cvar __step2_suffixes: Suffixes to be deleted in step 2 of the algorithm. :type __step2_suffixes: tuple :cvar __step3_suffixes: Suffixes to be deleted in step 3 of the algorithm. :type __step3_suffixes: tuple :note: A detailed description of the Swedish stemming algorithm can be found under http://snowball.tartarus.org/algorithms/swedish/stemmer.html """ __vowels = "aeiouy\xE4\xE5\xF6" __s_ending = "bcdfghjklmnoprtvy" __step1_suffixes = ("heterna", "hetens", "heter", "heten", "anden", "arnas", "ernas", "ornas", "andes", "andet", "arens", "arna", "erna", "orna", "ande", "arne", "aste", "aren", "ades", "erns", "ade", "are", "ern", "ens", "het", "ast", "ad", "en", "ar", "er", "or", "as", "es", "at", "a", "e", "s") __step2_suffixes = ("dd", "gd", "nn", "dt", "gt", "kt", "tt") __step3_suffixes = ("fullt", "l\xF6st", "els", "lig", "ig") def stem(self, word): """ Stem a Swedish word and return the stemmed form. :param word: The word that is stemmed. :type word: str or unicode :return: The stemmed form. :rtype: unicode """ word = word.lower() if word in self.stopwords: return word r1 = self._r1_scandinavian(word, self.__vowels) # STEP 1 for suffix in self.__step1_suffixes: if r1.endswith(suffix): if suffix == "s": if word[-2] in self.__s_ending: word = word[:-1] r1 = r1[:-1] else: word = word[:-len(suffix)] r1 = r1[:-len(suffix)] break # STEP 2 for suffix in self.__step2_suffixes: if r1.endswith(suffix): word = word[:-1] r1 = r1[:-1] break # STEP 3 for suffix in self.__step3_suffixes: if r1.endswith(suffix): if suffix in ("els", "lig", "ig"): word = word[:-len(suffix)] elif suffix in ("fullt", "l\xF6st"): word = word[:-1] break return word def demo(): """ This function provides a demonstration of the Snowball stemmers. After invoking this function and specifying a language, it stems an excerpt of the Universal Declaration of Human Rights (which is a part of the NLTK corpus collection) and then prints out the original and the stemmed text. 
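A non-interactive sketch of what the demo does for a single language (a hedged example: 'autobahn' is the output expected from the standard German Snowball algorithm described above):

    >>> from nltk.stem.snowball import SnowballStemmer
    >>> stemmer = SnowballStemmer("german")
    >>> print(stemmer.stem("Autobahnen"))
    autobahn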
""" import re from nltk.corpus import udhr udhr_corpus = {"danish": "Danish_Dansk-Latin1", "dutch": "Dutch_Nederlands-Latin1", "english": "English-Latin1", "finnish": "Finnish_Suomi-Latin1", "french": "French_Francais-Latin1", "german": "German_Deutsch-Latin1", "hungarian": "Hungarian_Magyar-UTF8", "italian": "Italian_Italiano-Latin1", "norwegian": "Norwegian-Latin1", "porter": "English-Latin1", "portuguese": "Portuguese_Portugues-Latin1", "romanian": "Romanian_Romana-Latin2", "russian": "Russian-UTF8", "spanish": "Spanish-Latin1", "swedish": "Swedish_Svenska-Latin1", } print("\n") print("******************************") print("Demo for the Snowball stemmers") print("******************************") while True: language = compat.raw_input("Please enter the name of the language " + "to be demonstrated\n" + "/".join(SnowballStemmer.languages) + "\n" + "(enter 'exit' in order to leave): ") if language == "exit": break if language not in SnowballStemmer.languages: print(("\nOops, there is no stemmer for this language. " + "Please try again.\n")) continue stemmer = SnowballStemmer(language) excerpt = udhr.words(udhr_corpus[language]) [:300] stemmed = " ".join(stemmer.stem(word) for word in excerpt) stemmed = re.sub(r"(.{,70})\s", r'\1\n', stemmed+' ').rstrip() excerpt = " ".join(excerpt) excerpt = re.sub(r"(.{,70})\s", r'\1\n', excerpt+' ').rstrip() print("\n") print('-' * 70) print('ORIGINAL'.center(70)) print(excerpt) print("\n\n") print('STEMMED RESULTS'.center(70)) print(stemmed) print('-' * 70) print("\n") nltk-3.1/nltk/stem/util.py0000644000076500000240000000054212607224144015326 0ustar sbstaff00000000000000# Natural Language Toolkit: Stemmer Utilities # # Copyright (C) 2001-2015 NLTK Project # Author: Helder # URL: # For license information, see LICENSE.TXT def suffix_replace(original, old, new): """ Replaces the old suffix of the original string by a new suffix """ return original[:-len(old)] + new nltk-3.1/nltk/stem/wordnet.py0000644000076500000240000000256112607224144016036 0ustar sbstaff00000000000000# Natural Language Toolkit: WordNet stemmer interface # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT from __future__ import unicode_literals from nltk.corpus.reader.wordnet import NOUN from nltk.corpus import wordnet from nltk.compat import python_2_unicode_compatible @python_2_unicode_compatible class WordNetLemmatizer(object): """ WordNet Lemmatizer Lemmatize using WordNet's built-in morphy function. Returns the input word unchanged if it cannot be found in WordNet. 
>>> from nltk.stem import WordNetLemmatizer >>> wnl = WordNetLemmatizer() >>> print(wnl.lemmatize('dogs')) dog >>> print(wnl.lemmatize('churches')) church >>> print(wnl.lemmatize('aardwolves')) aardwolf >>> print(wnl.lemmatize('abaci')) abacus >>> print(wnl.lemmatize('hardrock')) hardrock """ def __init__(self): pass def lemmatize(self, word, pos=NOUN): lemmas = wordnet._morphy(word, pos) return min(lemmas, key=len) if lemmas else word def __repr__(self): return '' # unload wordnet def teardown_module(module=None): from nltk.corpus import wordnet wordnet._unload() nltk-3.1/nltk/tag/0000755000076500000240000000000012610001541013565 5ustar sbstaff00000000000000nltk-3.1/nltk/tag/__init__.py0000644000076500000240000001212012607351371015711 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Taggers # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # URL: # For license information, see LICENSE.TXT """ NLTK Taggers This package contains classes and interfaces for part-of-speech tagging, or simply "tagging". A "tag" is a case-sensitive string that specifies some property of a token, such as its part of speech. Tagged tokens are encoded as tuples ``(tag, token)``. For example, the following tagged token combines the word ``'fly'`` with a noun part of speech tag (``'NN'``): >>> tagged_tok = ('fly', 'NN') An off-the-shelf tagger is available. It uses the Penn Treebank tagset: >>> from nltk import pos_tag, word_tokenize >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')] This package defines several taggers, which take a list of tokens, assign a tag to each one, and return the resulting list of tagged tokens. Most of the taggers are built automatically based on a training corpus. For example, the unigram tagger tags each word *w* by checking what the most frequent tag for *w* was in a training corpus: >>> from nltk.corpus import brown >>> from nltk.tag import UnigramTagger >>> tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500]) >>> sent = ['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment'] >>> for word, tag in tagger.tag(sent): ... print(word, '->', tag) Mitchell -> NP decried -> None the -> AT high -> JJ rate -> NN of -> IN unemployment -> None Note that words that the tagger has not seen during training receive a tag of ``None``. We evaluate a tagger on data that was not seen during training: >>> tagger.evaluate(brown.tagged_sents(categories='news')[500:600]) 0.73... For more information, please consult chapter 5 of the NLTK Book. 
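Taggers are usually combined with a backoff tagger, so that words unseen in training fall back to a simpler strategy instead of receiving ``None``. A hedged sketch, reusing the training slice from the example above (it assumes the Brown corpus is installed):

    >>> from nltk.corpus import brown
    >>> from nltk.tag import DefaultTagger, UnigramTagger
    >>> tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500],
    ...                        backoff=DefaultTagger('NN'))
    >>> tagger.tag(['Mitchell', 'decried', 'the', 'high', 'rate', 'of', 'unemployment'])[-1]
    ('unemployment', 'NN')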
""" from __future__ import print_function from nltk.tag.api import TaggerI from nltk.tag.util import str2tuple, tuple2str, untag from nltk.tag.sequential import (SequentialBackoffTagger, ContextTagger, DefaultTagger, NgramTagger, UnigramTagger, BigramTagger, TrigramTagger, AffixTagger, RegexpTagger, ClassifierBasedTagger, ClassifierBasedPOSTagger) from nltk.tag.brill import BrillTagger from nltk.tag.brill_trainer import BrillTaggerTrainer from nltk.tag.tnt import TnT from nltk.tag.hunpos import HunposTagger from nltk.tag.stanford import StanfordTagger, StanfordPOSTagger, StanfordNERTagger from nltk.tag.hmm import HiddenMarkovModelTagger, HiddenMarkovModelTrainer from nltk.tag.senna import SennaTagger, SennaChunkTagger, SennaNERTagger from nltk.tag.mapping import tagset_mapping, map_tag from nltk.tag.crf import CRFTagger from nltk.tag.perceptron import PerceptronTagger from nltk.data import load def _pos_tag(tokens, tagset, tagger): tagged_tokens = tagger.tag(tokens) if tagset: tagged_tokens = [(token, map_tag('en-ptb', tagset, tag)) for (token, tag) in tagged_tokens] return tagged_tokens def pos_tag(tokens, tagset=None): """ Use NLTK's currently recommended part of speech tagger to tag the given list of tokens. >>> from nltk.tag import pos_tag >>> from nltk.tokenize import word_tokenize >>> pos_tag(word_tokenize("John's big idea isn't all that bad.")) [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'), ('idea', 'NN'), ('is', 'VBZ'), ("n't", 'RB'), ('all', 'PDT'), ('that', 'DT'), ('bad', 'JJ'), ('.', '.')] >>> pos_tag(word_tokenize("John's big idea isn't all that bad."), tagset='universal') [('John', 'NOUN'), ("'s", 'PRT'), ('big', 'ADJ'), ('idea', 'NOUN'), ('is', 'VERB'), ("n't", 'ADV'), ('all', 'DET'), ('that', 'DET'), ('bad', 'ADJ'), ('.', '.')] NB. Use `pos_tag_sents()` for efficient tagging of more than one sentence. :param tokens: Sequence of tokens to be tagged :type tokens: list(str) :param tagset: the tagset to be used, e.g. universal, wsj, brown :type tagset: str :return: The tagged tokens :rtype: list(tuple(str, str)) """ tagger = PerceptronTagger() return _pos_tag(tokens, tagset, tagger) def pos_tag_sents(sentences, tagset=None): """ Use NLTK's currently recommended part of speech tagger to tag the given list of sentences, each consisting of a list of tokens. :param tokens: List of sentences to be tagged :type tokens: list(list(str)) :param tagset: the tagset to be used, e.g. universal, wsj, brown :type tagset: str :return: The list of tagged sentences :rtype: list(list(tuple(str, str))) """ tagger = PerceptronTagger() return [_pos_tag(sent, tagset, tagger) for sent in sentences] nltk-3.1/nltk/tag/api.py0000644000076500000240000000536012607224144014730 0ustar sbstaff00000000000000# Natural Language Toolkit: Tagger Interface # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # URL: # For license information, see LICENSE.TXT """ Interface for tagging each token in a sentence with supplementary information, such as its part of speech. """ from nltk.internals import overridden from nltk.metrics import accuracy from nltk.tag.util import untag class TaggerI(object): """ A processing interface for assigning a tag to each token in a list. Tags are case sensitive strings that identify some property of each token, such as its part of speech or its sense. Some taggers require specific types for their tokens. This is generally indicated by the use of a sub-interface to ``TaggerI``. 
For example, featureset taggers, which are subclassed from ``FeaturesetTagger``, require that each token be a ``featureset``. Subclasses must define: - either ``tag()`` or ``tag_sents()`` (or both) """ def tag(self, tokens): """ Determine the most appropriate tag sequence for the given token sequence, and return a corresponding list of tagged tokens. A tagged token is encoded as a tuple ``(token, tag)``. :rtype: list(tuple(str, str)) """ if overridden(self.tag_sents): return self.tag_sents([tokens])[0] else: raise NotImplementedError() def tag_sents(self, sentences): """ Apply ``self.tag()`` to each element of *sentences*. I.e.: return [self.tag(sent) for sent in sentences] """ return [self.tag(sent) for sent in sentences] def evaluate(self, gold): """ Score the accuracy of the tagger against the gold standard. Strip the tags from the gold standard text, retag it using the tagger, then compute the accuracy score. :type gold: list(list(tuple(str, str))) :param gold: The list of tagged sentences to score the tagger on. :rtype: float """ tagged_sents = self.tag_sents(untag(sent) for sent in gold) gold_tokens = sum(gold, []) test_tokens = sum(tagged_sents, []) return accuracy(gold_tokens, test_tokens) def _check_params(self, train, model): if (train and model) or (not train and not model): raise ValueError('Must specify either training data or trained model.') class FeaturesetTaggerI(TaggerI): """ A tagger that requires tokens to be ``featuresets``. A featureset is a dictionary that maps from feature names to feature values. See ``nltk.classify`` for more information about features and featuresets. """ nltk-3.1/nltk/tag/brill.py0000644000076500000240000003742212607224144015267 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2015 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT from __future__ import print_function, division from collections import defaultdict from nltk.compat import Counter from nltk.tag import TaggerI from nltk.tbl import Feature, Template from nltk import jsontags ###################################################################### # Brill Templates ###################################################################### @jsontags.register_tag class Word(Feature): """ Feature which examines the text (word) of nearby tokens. """ json_tag = 'nltk.tag.brill.Word' @staticmethod def extract_property(tokens, index): """@return: The given token's text.""" return tokens[index][0] @jsontags.register_tag class Pos(Feature): """ Feature which examines the tags of nearby tokens. 
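For illustration, the extracted property is just the tag of the token at the given index:

>>> Pos.extract_property([('The', 'DT'), ('cat', 'NN')], 1)
'NN'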
""" json_tag = 'nltk.tag.brill.Pos' @staticmethod def extract_property(tokens, index): """@return: The given token's tag.""" return tokens[index][1] def nltkdemo18(): """ Return 18 templates, from the original nltk demo, in multi-feature syntax """ return [ Template(Pos([-1])), Template(Pos([1])), Template(Pos([-2])), Template(Pos([2])), Template(Pos([-2, -1])), Template(Pos([1, 2])), Template(Pos([-3, -2, -1])), Template(Pos([1, 2, 3])), Template(Pos([-1]), Pos([1])), Template(Word([-1])), Template(Word([1])), Template(Word([-2])), Template(Word([2])), Template(Word([-2, -1])), Template(Word([1, 2])), Template(Word([-3, -2, -1])), Template(Word([1, 2, 3])), Template(Word([-1]), Word([1])), ] def nltkdemo18plus(): """ Return 18 templates, from the original nltk demo, and additionally a few multi-feature ones (the motivation is easy comparison with nltkdemo18) """ return nltkdemo18() + [ Template(Word([-1]), Pos([1])), Template(Pos([-1]), Word([1])), Template(Word([-1]), Word([0]), Pos([1])), Template(Pos([-1]), Word([0]), Word([1])), Template(Pos([-1]), Word([0]), Pos([1])), ] def fntbl37(): """ Return 37 templates taken from the postagging task of the fntbl distribution http://www.cs.jhu.edu/~rflorian/fntbl/ (37 is after excluding a handful which do not condition on Pos[0]; fntbl can do that but the current nltk implementation cannot.) """ return [ Template(Word([0]), Word([1]), Word([2])), Template(Word([-1]), Word([0]), Word([1])), Template(Word([0]), Word([-1])), Template(Word([0]), Word([1])), Template(Word([0]), Word([2])), Template(Word([0]), Word([-2])), Template(Word([1, 2])), Template(Word([-2, -1])), Template(Word([1, 2, 3])), Template(Word([-3, -2, -1])), Template(Word([0]), Pos([2])), Template(Word([0]), Pos([-2])), Template(Word([0]), Pos([1])), Template(Word([0]), Pos([-1])), Template(Word([0])), Template(Word([-2])), Template(Word([2])), Template(Word([1])), Template(Word([-1])), Template(Pos([-1]), Pos([1])), Template(Pos([1]), Pos([2])), Template(Pos([-1]), Pos([-2])), Template(Pos([1])), Template(Pos([-1])), Template(Pos([-2])), Template(Pos([2])), Template(Pos([1, 2, 3])), Template(Pos([1, 2])), Template(Pos([-3, -2, -1])), Template(Pos([-2, -1])), Template(Pos([1]), Word([0]), Word([1])), Template(Pos([1]), Word([0]), Word([-1])), Template(Pos([-1]), Word([-1]), Word([0])), Template(Pos([-1]), Word([0]), Word([1])), Template(Pos([-2]), Pos([-1])), Template(Pos([1]), Pos([2])), Template(Pos([1]), Pos([2]), Word([1])) ] def brill24(): """ Return 24 templates of the seminal TBL paper, Brill (1995) """ return [ Template(Pos([-1])), Template(Pos([1])), Template(Pos([-2])), Template(Pos([2])), Template(Pos([-2, -1])), Template(Pos([1, 2])), Template(Pos([-3, -2, -1])), Template(Pos([1, 2, 3])), Template(Pos([-1]), Pos([1])), Template(Pos([-2]), Pos([-1])), Template(Pos([1]), Pos([2])), Template(Word([-1])), Template(Word([1])), Template(Word([-2])), Template(Word([2])), Template(Word([-2, -1])), Template(Word([1, 2])), Template(Word([-1, 0])), Template(Word([0, 1])), Template(Word([0])), Template(Word([-1]), Pos([-1])), Template(Word([1]), Pos([1])), Template(Word([0]), Word([-1]), Pos([-1])), Template(Word([0]), Word([1]), Pos([1])), ] def describe_template_sets(): """ Print the available template sets in this demo, with a short description" """ import inspect import sys # a bit of magic to get all functions in this module templatesets = inspect.getmembers(sys.modules[__name__], inspect.isfunction) for (name, obj) in templatesets: if name == "describe_template_sets": 
continue print(name, obj.__doc__, "\n") ###################################################################### # The Brill Tagger ###################################################################### @jsontags.register_tag class BrillTagger(TaggerI): """ Brill's transformational rule-based tagger. Brill taggers use an initial tagger (such as ``tag.DefaultTagger``) to assign an initial tag sequence to a text; and then apply an ordered list of transformational rules to correct the tags of individual tokens. These transformation rules are specified by the ``TagRule`` interface. Brill taggers can be created directly, from an initial tagger and a list of transformational rules; but more often, Brill taggers are created by learning rules from a training corpus, using one of the TaggerTrainers available. """ json_tag = 'nltk.tag.BrillTagger' def __init__(self, initial_tagger, rules, training_stats=None): """ :param initial_tagger: The initial tagger :type initial_tagger: TaggerI :param rules: An ordered list of transformation rules that should be used to correct the initial tagging. :type rules: list(TagRule) :param training_stats: A dictionary of statistics collected during training, for possible later use :type training_stats: dict """ self._initial_tagger = initial_tagger self._rules = tuple(rules) self._training_stats = training_stats def encode_json_obj(self): return self._initial_tagger, self._rules, self._training_stats @classmethod def decode_json_obj(cls, obj): _initial_tagger, _rules, _training_stats = obj return cls(_initial_tagger, _rules, _training_stats) def rules(self): """ Return the ordered list of transformation rules that this tagger has learnt :return: the ordered list of transformation rules that correct the initial tagging :rtype: list of Rules """ return self._rules def train_stats(self, statistic=None): """ Return a named statistic collected during training, or a dictionary of all available statistics if no name given :param statistic: name of statistic :type statistic: str :return: some statistic collected during training of this tagger :rtype: any (but usually a number) """ if statistic is None: return self._training_stats else: return self._training_stats.get(statistic) def tag(self, tokens): # Inherit documentation from TaggerI # Run the initial tagger. tagged_tokens = self._initial_tagger.tag(tokens) # Create a dictionary that maps each tag to a list of the # indices of tokens that have that tag. tag_to_positions = defaultdict(set) for i, (token, tag) in enumerate(tagged_tokens): tag_to_positions[tag].add(i) # Apply each rule, in order. Only try to apply rules at # positions that have the desired original tag. for rule in self._rules: # Find the positions where it might apply positions = tag_to_positions.get(rule.original_tag, []) # Apply the rule at those positions. changed = rule.apply(tagged_tokens, positions) # Update tag_to_positions with the positions of tags that # were modified. for i in changed: tag_to_positions[rule.original_tag].remove(i) tag_to_positions[rule.replacement_tag].add(i) return tagged_tokens def print_template_statistics(self, test_stats=None, printunused=True): """ Print a list of all templates, ranked according to efficiency. If test_stats is available, the templates are ranked according to their relative contribution (summed for all rules created from a given template, weighted by score) to the performance on the test set. If no test_stats, then statistics collected during training are used instead. 
There is also an unweighted measure (just counting the rules). This is less informative, though, as many low-score rules will appear towards end of training. :param test_stats: dictionary of statistics collected during testing :type test_stats: dict of str -> any (but usually numbers) :param printunused: if True, print a list of all unused templates :type printunused: bool :return: None :rtype: None """ tids = [r.templateid for r in self._rules] train_stats = self.train_stats() trainscores = train_stats['rulescores'] assert len(trainscores) == len(tids), "corrupt statistics: " \ "{0} train scores for {1} rules".format(trainscores, tids) template_counts = Counter(tids) weighted_traincounts = Counter() for (tid, score) in zip(tids, trainscores): weighted_traincounts[tid] += score tottrainscores = sum(trainscores) # det_tplsort() is for deterministic sorting; # the otherwise convenient Counter.most_common() unfortunately # does not break ties deterministically # between python versions and will break cross-version tests def det_tplsort(tpl_value): return (tpl_value[1], repr(tpl_value[0])) def print_train_stats(): print("TEMPLATE STATISTICS (TRAIN) {0} templates, {1} rules)".format( len(template_counts), len(tids)) ) print("TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} " "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats)) head = "#ID | Score (train) | #Rules | Template" print(head, "\n", "-" * len(head), sep="") train_tplscores = sorted(weighted_traincounts.items(), key=det_tplsort, reverse=True) for (tid, trainscore) in train_tplscores: s = "{0} | {1:5d} {2:5.3f} |{3:4d} {4:.3f} | {5}".format( tid, trainscore, trainscore/tottrainscores, template_counts[tid], template_counts[tid]/len(tids), Template.ALLTEMPLATES[int(tid)], ) print(s) def print_testtrain_stats(): testscores = test_stats['rulescores'] print("TEMPLATE STATISTICS (TEST AND TRAIN) ({0} templates, {1} rules)".format( len(template_counts), len(tids)), ) print("TEST ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} " "final: {finalerrors:5d} {finalacc:.4f} ".format(**test_stats)) print("TRAIN ({tokencount:7d} tokens) initial {initialerrors:5d} {initialacc:.4f} " "final: {finalerrors:5d} {finalacc:.4f} ".format(**train_stats)) weighted_testcounts = Counter() for (tid, score) in zip(tids, testscores): weighted_testcounts[tid] += score tottestscores = sum(testscores) head = "#ID | Score (test) | Score (train) | #Rules | Template" print(head, "\n", "-" * len(head), sep="") test_tplscores = sorted(weighted_testcounts.items(), key=det_tplsort, reverse=True) for (tid, testscore) in test_tplscores: s = "{0:s} |{1:5d} {2:6.3f} | {3:4d} {4:.3f} |{5:4d} {6:.3f} | {7:s}".format( tid, testscore, testscore/tottestscores, weighted_traincounts[tid], weighted_traincounts[tid]/tottrainscores, template_counts[tid], template_counts[tid]/len(tids), Template.ALLTEMPLATES[int(tid)], ) print(s) def print_unused_templates(): usedtpls = set([int(tid) for tid in tids]) unused = [(tid, tpl) for (tid, tpl) in enumerate(Template.ALLTEMPLATES) if tid not in usedtpls] print("UNUSED TEMPLATES ({0})".format(len(unused))) for (tid, tpl) in unused: print("{0:03d} {1:s}".format(tid, tpl)) if test_stats is None: print_train_stats() else: print_testtrain_stats() print() if printunused: print_unused_templates() print() def batch_tag_incremental(self, sequences, gold): """ Tags by applying each rule to the entire corpus (rather than all rules to a single sequence). 
The point is to collect statistics on the test set for individual rules. NOTE: This is inefficient (does not build any index, so will traverse the entire corpus N times for N rules) -- usually you would not care about statistics for individual rules and thus use batch_tag() instead :param sequences: lists of token sequences (sentences, in some applications) to be tagged :type sequences: list of list of strings :param gold: the gold standard :type gold: list of list of strings :returns: tuple of (tagged_sequences, ordered list of rule scores (one for each rule)) """ def counterrors(xs): return sum(t[1] != g[1] for pair in zip(xs, gold) for (t, g) in zip(*pair)) testing_stats = {} testing_stats['tokencount'] = sum(len(t) for t in sequences) testing_stats['sequencecount'] = len(sequences) tagged_tokenses = [self._initial_tagger.tag(tokens) for tokens in sequences] testing_stats['initialerrors'] = counterrors(tagged_tokenses) testing_stats['initialacc'] = 1 - testing_stats['initialerrors']/testing_stats['tokencount'] # Apply each rule to the entire corpus, in order errors = [testing_stats['initialerrors']] for rule in self._rules: for tagged_tokens in tagged_tokenses: rule.apply(tagged_tokens) errors.append(counterrors(tagged_tokenses)) testing_stats['rulescores'] = [err0 - err1 for (err0, err1) in zip(errors, errors[1:])] testing_stats['finalerrors'] = errors[-1] testing_stats['finalacc'] = 1 - testing_stats['finalerrors']/testing_stats['tokencount'] return (tagged_tokenses, testing_stats) nltk-3.1/nltk/tag/brill_trainer.py0000644000076500000240000006504012607224144017010 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2013 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT from __future__ import print_function, division import bisect import textwrap from collections import defaultdict from nltk.tag import untag, BrillTagger ###################################################################### # Brill Tagger Trainer ###################################################################### class BrillTaggerTrainer(object): """ A trainer for tbl taggers. """ def __init__(self, initial_tagger, templates, trace=0, deterministic=None, ruleformat="str"): """ Construct a Brill tagger from a baseline tagger and a set of templates :param initial_tagger: the baseline tagger :type initial_tagger: Tagger :param templates: templates to be used in training :type templates: list of Templates :param trace: verbosity level :type trace: int :param deterministic: if True, adjudicate ties deterministically :type deterministic: bool :param ruleformat: format of reported Rules :type ruleformat: str :return: An untrained BrillTagger :rtype: BrillTagger """ if deterministic is None: deterministic = (trace > 0) self._initial_tagger = initial_tagger self._templates = templates self._trace = trace self._deterministic = deterministic self._ruleformat = ruleformat self._tag_positions = None """Mapping from tags to lists of positions that use that tag.""" self._rules_by_position = None """Mapping from positions to the set of rules that are known to occur at that position. Position is (sentnum, wordnum). 
Initially, this will only contain positions where each rule applies in a helpful way; but when we examine a rule, we'll extend this list to also include positions where each rule applies in a harmful or neutral way.""" self._positions_by_rule = None """Mapping from rule to position to effect, specifying the effect that each rule has on the overall score, at each position. Position is (sentnum, wordnum); and effect is -1, 0, or 1. As with _rules_by_position, this mapping starts out only containing rules with positive effects; but when we examine a rule, we'll extend this mapping to include the positions where the rule is harmful or neutral.""" self._rules_by_score = None """Mapping from scores to the set of rules whose effect on the overall score is upper bounded by that score. Invariant: rulesByScore[s] will contain r iff the sum of _positions_by_rule[r] is s.""" self._rule_scores = None """Mapping from rules to upper bounds on their effects on the overall score. This is the inverse mapping to _rules_by_score. Invariant: ruleScores[r] = sum(_positions_by_rule[r])""" self._first_unknown_position = None """Mapping from rules to the first position where we're unsure if the rule applies. This records the next position we need to check to see if the rule messed anything up.""" # Training def train(self, train_sents, max_rules=200, min_score=2, min_acc=None): """ Trains the Brill tagger on the corpus *train_sents*, producing at most *max_rules* transformations, each of which reduces the net number of errors in the corpus by at least *min_score*, and each of which has accuracy not lower than *min_acc*. #imports >>> from nltk.tbl.template import Template >>> from nltk.tag.brill import Pos, Word >>> from nltk.tag import RegexpTagger, BrillTaggerTrainer #some data >>> from nltk.corpus import treebank >>> training_data = treebank.tagged_sents()[:100] >>> baseline_data = treebank.tagged_sents()[100:200] >>> gold_data = treebank.tagged_sents()[200:300] >>> testing_data = [untag(s) for s in gold_data] >>> backoff = RegexpTagger([ ... (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers ... (r'(The|the|A|a|An|an)$', 'AT'), # articles ... (r'.*able$', 'JJ'), # adjectives ... (r'.*ness$', 'NN'), # nouns formed from adjectives ... (r'.*ly$', 'RB'), # adverbs ... (r'.*s$', 'NNS'), # plural nouns ... (r'.*ing$', 'VBG'), # gerunds ... (r'.*ed$', 'VBD'), # past tense verbs ... (r'.*', 'NN') # nouns (default) ... ]) >>> baseline = backoff #see NOTE1 >>> baseline.evaluate(gold_data) #doctest: +ELLIPSIS 0.2450142... #templates >>> Template._cleartemplates() #clear any templates created in earlier tests >>> templates = [Template(Pos([-1])), Template(Pos([-1]), Word([0]))] #construct a BrillTaggerTrainer >>> tt = BrillTaggerTrainer(baseline, templates, trace=3) >>> tagger1 = tt.train(training_data, max_rules=10) TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: None) Finding initial useful rules... Found 845 useful rules. B | S F r O | Score = Fixed - Broken c i o t | R Fixed = num tags changed incorrect -> correct o x k h | u Broken = num tags changed correct -> incorrect r e e e | l Other = num tags changed incorrect -> incorrect e d n r | e ------------------+------------------------------------------------------- 132 132 0 0 | AT->DT if Pos:NN@[-1] 85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0] 69 69 0 0 | NN->. if Pos:NN@[-1] & Word:.@[0] 51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0] 47 63 16 161 | NN->IN if Pos:NNS@[-1] 33 33 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0] 26 26 0 0 | IN->. 
if Pos:NNS@[-1] & Word:.@[0] 24 24 0 0 | IN->, if Pos:NNS@[-1] & Word:,@[0] 22 27 5 24 | NN->-NONE- if Pos:VBD@[-1] 17 17 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0] >>> tagger1.rules()[1:3] (Rule('001', 'NN', ',', [(Pos([-1]),'NN'), (Word([0]),',')]), Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')])) >>> train_stats = tagger1.train_stats() >>> [train_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']] [1775, 1269, [132, 85, 69, 51, 47, 33, 26, 24, 22, 17]] >>> tagger1.print_template_statistics(printunused=False) TEMPLATE STATISTICS (TRAIN) 2 templates, 10 rules) TRAIN ( 2417 tokens) initial 1775 0.2656 final: 1269 0.4750 #ID | Score (train) | #Rules | Template -------------------------------------------- 001 | 305 0.603 | 7 0.700 | Template(Pos([-1]),Word([0])) 000 | 201 0.397 | 3 0.300 | Template(Pos([-1])) >>> tagger1.evaluate(gold_data) # doctest: +ELLIPSIS 0.43996... >>> tagged, test_stats = tagger1.batch_tag_incremental(testing_data, gold_data) >>> tagged[33][12:] == [('foreign', 'IN'), ('debt', 'NN'), ('of', 'IN'), ('$', 'NN'), ('64', 'CD'), ... ('billion', 'NN'), ('*U*', 'NN'), ('--', 'NN'), ('the', 'DT'), ('third-highest', 'NN'), ('in', 'NN'), ... ('the', 'DT'), ('developing', 'VBG'), ('world', 'NN'), ('.', '.')] True >>> [test_stats[stat] for stat in ['initialerrors', 'finalerrors', 'rulescores']] [1855, 1376, [100, 85, 67, 58, 27, 36, 27, 16, 31, 32]] # a high-accuracy tagger >>> tagger2 = tt.train(training_data, max_rules=10, min_acc=0.99) TBL train (fast) (seqs: 100; tokens: 2417; tpls: 2; min score: 2; min acc: 0.99) Finding initial useful rules... Found 845 useful rules. B | S F r O | Score = Fixed - Broken c i o t | R Fixed = num tags changed incorrect -> correct o x k h | u Broken = num tags changed correct -> incorrect r e e e | l Other = num tags changed incorrect -> incorrect e d n r | e ------------------+------------------------------------------------------- 132 132 0 0 | AT->DT if Pos:NN@[-1] 85 85 0 0 | NN->, if Pos:NN@[-1] & Word:,@[0] 69 69 0 0 | NN->. if Pos:NN@[-1] & Word:.@[0] 51 51 0 0 | NN->IN if Pos:NN@[-1] & Word:of@[0] 36 36 0 0 | NN->TO if Pos:NN@[-1] & Word:to@[0] 26 26 0 0 | NN->. if Pos:NNS@[-1] & Word:.@[0] 24 24 0 0 | NN->, if Pos:NNS@[-1] & Word:,@[0] 19 19 0 6 | NN->VB if Pos:TO@[-1] 18 18 0 0 | CD->-NONE- if Pos:NN@[-1] & Word:0@[0] 18 18 0 0 | NN->CC if Pos:NN@[-1] & Word:and@[0] >>> tagger2.evaluate(gold_data) # doctest: +ELLIPSIS 0.44159544... >>> tagger2.rules()[2:4] (Rule('001', 'NN', '.', [(Pos([-1]),'NN'), (Word([0]),'.')]), Rule('001', 'NN', 'IN', [(Pos([-1]),'NN'), (Word([0]),'of')])) # NOTE1: (!!FIXME) A far better baseline uses nltk.tag.UnigramTagger, # with a RegexpTagger only as backoff. For instance, # >>> baseline = UnigramTagger(baseline_data, backoff=backoff) # However, as of Nov 2013, nltk.tag.UnigramTagger does not yield consistent results # between python versions. The simplistic backoff above is a workaround to make doctests # get consistent input. 
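# A trained tagger such as tagger1 above can then be applied directly to new,
# untagged sentences, e.g.
# >>> tagger1.tag('The cat sat on the mat .'.split())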
:param train_sents: training data :type train_sents: list(list(tuple)) :param max_rules: output at most max_rules rules :type max_rules: int :param min_score: stop training when no rules better than min_score can be found :type min_score: int :param min_acc: discard any rule with lower accuracy than min_acc :type min_acc: float or None :return: the learned tagger :rtype: BrillTagger """ # FIXME: several tests are a bit too dependent on tracing format # FIXME: tests in trainer.fast and trainer.brillorig are exact duplicates # Basic idea: Keep track of the rules that apply at each position. # And keep track of the positions to which each rule applies. # Create a new copy of the training corpus, and run the # initial tagger on it. We will progressively update this # test corpus to look more like the training corpus. test_sents = [list(self._initial_tagger.tag(untag(sent))) for sent in train_sents] # Collect some statistics on the training process trainstats = {} trainstats['min_acc'] = min_acc trainstats['min_score'] = min_score trainstats['tokencount'] = sum(len(t) for t in test_sents) trainstats['sequencecount'] = len(test_sents) trainstats['templatecount'] = len(self._templates) trainstats['rulescores'] = [] trainstats['initialerrors'] = sum( tag[1] != truth[1] for paired in zip(test_sents, train_sents) for (tag, truth) in zip(*paired) ) trainstats['initialacc'] = 1 - trainstats['initialerrors']/trainstats['tokencount'] if self._trace > 0: print("TBL train (fast) (seqs: {sequencecount}; tokens: {tokencount}; " "tpls: {templatecount}; min score: {min_score}; min acc: {min_acc})".format(**trainstats)) # Initialize our mappings. This will find any errors made # by the initial tagger, and use those to generate repair # rules, which are added to the rule mappings. if self._trace: print("Finding initial useful rules...") self._init_mappings(test_sents, train_sents) if self._trace: print((" Found %d useful rules." % len(self._rule_scores))) # Let the user know what we're up to. if self._trace > 2: self._trace_header() elif self._trace == 1: print("Selecting rules...") # Repeatedly select the best rule, and add it to `rules`. rules = [] try: while (len(rules) < max_rules): # Find the best rule, and add it to our rule list. rule = self._best_rule(train_sents, test_sents, min_score, min_acc) if rule: rules.append(rule) score = self._rule_scores[rule] trainstats['rulescores'].append(score) else: break # No more good rules left! # Report the rule that we found. if self._trace > 1: self._trace_rule(rule) # Apply the new rule at the relevant sites self._apply_rule(rule, test_sents) # Update _tag_positions[rule.original_tag] and # _tag_positions[rule.replacement_tag] for the affected # positions (i.e., self._positions_by_rule[rule]). self._update_tag_positions(rule) # Update rules that were affected by the change. self._update_rules(rule, train_sents, test_sents) # The user can cancel training manually: except KeyboardInterrupt: print("Training stopped manually -- %d rules found" % len(rules)) # Discard our tag position mapping & rule mappings. self._clean() trainstats['finalerrors'] = trainstats['initialerrors'] - sum(trainstats['rulescores']) trainstats['finalacc'] = 1 - trainstats['finalerrors']/trainstats['tokencount'] # Create and return a tagger from the rules we found. return BrillTagger(self._initial_tagger, rules, trainstats) def _init_mappings(self, test_sents, train_sents): """ Initialize the tag position mapping & the rule related mappings. 
For each error in test_sents, find new rules that would correct them, and add them to the rule mappings. """ self._tag_positions = defaultdict(list) self._rules_by_position = defaultdict(set) self._positions_by_rule = defaultdict(dict) self._rules_by_score = defaultdict(set) self._rule_scores = defaultdict(int) self._first_unknown_position = defaultdict(int) # Scan through the corpus, initializing the tag_positions # mapping and all the rule-related mappings. for sentnum, sent in enumerate(test_sents): for wordnum, (word, tag) in enumerate(sent): # Initialize tag_positions self._tag_positions[tag].append((sentnum, wordnum)) # If it's an error token, update the rule-related mappings. correct_tag = train_sents[sentnum][wordnum][1] if tag != correct_tag: for rule in self._find_rules(sent, wordnum, correct_tag): self._update_rule_applies(rule, sentnum, wordnum, train_sents) def _clean(self): self._tag_positions = None self._rules_by_position = None self._positions_by_rule = None self._rules_by_score = None self._rule_scores = None self._first_unknown_position = None def _find_rules(self, sent, wordnum, new_tag): """ Use the templates to find rules that apply at index *wordnum* in the sentence *sent* and generate the tag *new_tag*. """ for template in self._templates: for rule in template.applicable_rules(sent, wordnum, new_tag): yield rule def _update_rule_applies(self, rule, sentnum, wordnum, train_sents): """ Update the rule data tables to reflect the fact that *rule* applies at the position *(sentnum, wordnum)*. """ pos = sentnum, wordnum # If the rule is already known to apply here, ignore. # (This only happens if the position's tag hasn't changed.) if pos in self._positions_by_rule[rule]: return # Update self._positions_by_rule. correct_tag = train_sents[sentnum][wordnum][1] if rule.replacement_tag == correct_tag: self._positions_by_rule[rule][pos] = 1 elif rule.original_tag == correct_tag: self._positions_by_rule[rule][pos] = -1 else: # was wrong, remains wrong self._positions_by_rule[rule][pos] = 0 # Update _rules_by_position self._rules_by_position[pos].add(rule) # Update _rule_scores. old_score = self._rule_scores[rule] self._rule_scores[rule] += self._positions_by_rule[rule][pos] # Update _rules_by_score. self._rules_by_score[old_score].discard(rule) self._rules_by_score[self._rule_scores[rule]].add(rule) def _update_rule_not_applies(self, rule, sentnum, wordnum): """ Update the rule data tables to reflect the fact that *rule* does not apply at the position *(sentnum, wordnum)*. """ pos = sentnum, wordnum # Update _rule_scores. old_score = self._rule_scores[rule] self._rule_scores[rule] -= self._positions_by_rule[rule][pos] # Update _rules_by_score. self._rules_by_score[old_score].discard(rule) self._rules_by_score[self._rule_scores[rule]].add(rule) # Update _positions_by_rule del self._positions_by_rule[rule][pos] self._rules_by_position[pos].remove(rule) # Optional addition: if the rule now applies nowhere, delete # all its dictionary entries. def _best_rule(self, train_sents, test_sents, min_score, min_acc): """ Find the next best rule. This is done by repeatedly taking a rule with the highest score and stepping through the corpus to see where it applies. When it makes an error (decreasing its score) it's bumped down, and we try a new rule with the highest score. When we find a rule which has the highest score *and* which has been tested against the entire corpus, we can conclude that it's the next best rule. 
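A rule's score is the number of incorrect tags it fixes minus the number of correct tags it breaks (for example, a rule that fixes 63 tags while breaking 16 has score 47); the scores tracked here are optimistic upper bounds, and are only lowered as more of the corpus is examined.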
""" for max_score in sorted(self._rules_by_score.keys(), reverse=True): if len(self._rules_by_score) == 0: return None if max_score < min_score or max_score <= 0: return None best_rules = list(self._rules_by_score[max_score]) if self._deterministic: best_rules.sort(key=repr) for rule in best_rules: positions = self._tag_positions[rule.original_tag] unk = self._first_unknown_position.get(rule, (0, -1)) start = bisect.bisect_left(positions, unk) for i in range(start, len(positions)): sentnum, wordnum = positions[i] if rule.applies(test_sents[sentnum], wordnum): self._update_rule_applies(rule, sentnum, wordnum, train_sents) if self._rule_scores[rule] < max_score: self._first_unknown_position[rule] = (sentnum, wordnum+1) break # The update demoted the rule. if self._rule_scores[rule] == max_score: self._first_unknown_position[rule] = (len(train_sents) + 1, 0) # optimization: if no min_acc threshold given, don't bother computing accuracy if min_acc is None: return rule else: changes = self._positions_by_rule[rule].values() num_fixed = len([c for c in changes if c == 1]) num_broken = len([c for c in changes if c == -1]) # acc here is fixed/(fixed+broken); could also be # fixed/(fixed+broken+other) == num_fixed/len(changes) acc = num_fixed/(num_fixed+num_broken) if acc >= min_acc: return rule # else: rule too inaccurate, discard and try next # We demoted (or skipped due to < min_acc, if that was given) # all the rules with score==max_score. assert min_acc is not None or not self._rules_by_score[max_score] if not self._rules_by_score[max_score]: del self._rules_by_score[max_score] def _apply_rule(self, rule, test_sents): """ Update *test_sents* by applying *rule* everywhere where its conditions are met. """ update_positions = set(self._positions_by_rule[rule]) new_tag = rule.replacement_tag if self._trace > 3: self._trace_apply(len(update_positions)) # Update test_sents. for (sentnum, wordnum) in update_positions: text = test_sents[sentnum][wordnum][0] test_sents[sentnum][wordnum] = (text, new_tag) def _update_tag_positions(self, rule): """ Update _tag_positions to reflect the changes to tags that are made by *rule*. """ # Update the tag index. for pos in self._positions_by_rule[rule]: # Delete the old tag. old_tag_positions = self._tag_positions[rule.original_tag] old_index = bisect.bisect_left(old_tag_positions, pos) del old_tag_positions[old_index] # Insert the new tag. new_tag_positions = self._tag_positions[rule.replacement_tag] bisect.insort_left(new_tag_positions, pos) def _update_rules(self, rule, train_sents, test_sents): """ Check if we should add or remove any rules from consideration, given the changes made by *rule*. """ # Collect a list of all positions that might be affected. neighbors = set() for sentnum, wordnum in self._positions_by_rule[rule]: for template in self._templates: n = template.get_neighborhood(test_sents[sentnum], wordnum) neighbors.update([(sentnum, i) for i in n]) # Update the rules at each position. num_obsolete = num_new = num_unseen = 0 for sentnum, wordnum in neighbors: test_sent = test_sents[sentnum] correct_tag = train_sents[sentnum][wordnum][1] # Check if the change causes any rule at this position to # stop matching; if so, then update our rule mappings # accordingly. 
old_rules = set(self._rules_by_position[sentnum, wordnum]) for old_rule in old_rules: if not old_rule.applies(test_sent, wordnum): num_obsolete += 1 self._update_rule_not_applies(old_rule, sentnum, wordnum) # Check if the change causes our templates to propose any # new rules for this position. for template in self._templates: for new_rule in template.applicable_rules(test_sent, wordnum, correct_tag): if new_rule not in old_rules: num_new += 1 if new_rule not in self._rule_scores: num_unseen += 1 old_rules.add(new_rule) self._update_rule_applies(new_rule, sentnum, wordnum, train_sents) # We may have caused other rules to match here, that are # not proposed by our templates -- in particular, rules # that are harmful or neutral. We therefore need to # update any rule whose first_unknown_position is past # this rule. for new_rule, pos in self._first_unknown_position.items(): if pos > (sentnum, wordnum): if new_rule not in old_rules: num_new += 1 if new_rule.applies(test_sent, wordnum): self._update_rule_applies(new_rule, sentnum, wordnum, train_sents) if self._trace > 3: self._trace_update_rules(num_obsolete, num_new, num_unseen) # Tracing def _trace_header(self): print(""" B | S F r O | Score = Fixed - Broken c i o t | R Fixed = num tags changed incorrect -> correct o x k h | u Broken = num tags changed correct -> incorrect r e e e | l Other = num tags changed incorrect -> incorrect e d n r | e ------------------+------------------------------------------------------- """.rstrip()) def _trace_rule(self, rule): assert self._rule_scores[rule] == sum(self._positions_by_rule[rule].values()) changes = self._positions_by_rule[rule].values() num_fixed = len([c for c in changes if c == 1]) num_broken = len([c for c in changes if c == -1]) num_other = len([c for c in changes if c == 0]) score = self._rule_scores[rule] rulestr = rule.format(self._ruleformat) if self._trace > 2: print('%4d%4d%4d%4d |' % (score, num_fixed, num_broken, num_other), end=' ') print(textwrap.fill(rulestr, initial_indent=' '*20, width=79, subsequent_indent=' '*18+'| ').strip()) else: print(rulestr) def _trace_apply(self, num_updates): prefix = ' '*18+'|' print(prefix) print(prefix, 'Applying rule to %d positions.' % num_updates) def _trace_update_rules(self, num_obsolete, num_new, num_unseen): prefix = ' '*18+'|' print(prefix, 'Updated rule tables:') print(prefix, (' - %d rule applications removed' % num_obsolete)) print(prefix, (' - %d rule applications added (%d novel)' % (num_new, num_unseen))) print(prefix) nltk-3.1/nltk/tag/crf.py0000644000076500000240000001761612607224144014740 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Interface to the CRFSuite Tagger # # Copyright (C) 2001-2015 NLTK Project # Author: Long Duong # URL: # For license information, see LICENSE.TXT """ A module for POS tagging using CRFSuite """ from __future__ import absolute_import from __future__ import unicode_literals import unicodedata import re from nltk.tag.api import TaggerI try: import pycrfsuite except ImportError: pass class CRFTagger(TaggerI): """ A module for POS tagging using CRFSuite https://pypi.python.org/pypi/python-crfsuite >>> from nltk.tag import CRFTagger >>> ct = CRFTagger() >>> train_data = [[('University','Noun'), ('is','Verb'), ('a','Det'), ('good','Adj'), ('place','Noun')], ... 
[('dog','Noun'),('eat','Verb'),('meat','Noun')]] >>> ct.train(train_data,'model.crf.tagger') >>> ct.tag_sents([['dog','is','good'], ['Cat','eat','meat']]) [[('dog', 'Noun'), ('is', 'Verb'), ('good', 'Adj')], [('Cat', 'Noun'), ('eat', 'Verb'), ('meat', 'Noun')]] >>> gold_sentences = [[('dog','Noun'),('is','Verb'),('good','Adj')] , [('Cat','Noun'),('eat','Verb'), ('meat','Noun')]] >>> ct.evaluate(gold_sentences) 1.0 Setting learned model file >>> ct = CRFTagger() >>> ct.set_model_file('model.crf.tagger') >>> ct.evaluate(gold_sentences) 1.0 """ def __init__(self, feature_func = None, verbose = False, training_opt = {}): """ Initialize the CRFSuite tagger :param feature_func: The function that extracts features for each token of a sentence. This function should take 2 parameters: tokens and index which extract features at index position from tokens list. See the build in _get_features function for more detail. :param verbose: output the debugging messages during training. :type verbose: boolean :param training_opt: python-crfsuite training options :type training_opt : dictionary Set of possible training options (using LBFGS training algorithm). 'feature.minfreq' : The minimum frequency of features. 'feature.possible_states' : Force to generate possible state features. 'feature.possible_transitions' : Force to generate possible transition features. 'c1' : Coefficient for L1 regularization. 'c2' : Coefficient for L2 regularization. 'max_iterations' : The maximum number of iterations for L-BFGS optimization. 'num_memories' : The number of limited memories for approximating the inverse hessian matrix. 'epsilon' : Epsilon for testing the convergence of the objective. 'period' : The duration of iterations to test the stopping criterion. 'delta' : The threshold for the stopping criterion; an L-BFGS iteration stops when the improvement of the log likelihood over the last ${period} iterations is no greater than this threshold. 'linesearch' : The line search algorithm used in L-BFGS updates: { 'MoreThuente': More and Thuente's method, 'Backtracking': Backtracking method with regular Wolfe condition, 'StrongBacktracking': Backtracking method with strong Wolfe condition } 'max_linesearch' : The maximum number of trials for the line search algorithm. """ self._model_file = '' self._tagger = pycrfsuite.Tagger() if feature_func is None: self._feature_func = self._get_features else: self._feature_func = feature_func self._verbose = verbose self._training_options = training_opt self._pattern = re.compile(r'\d') def set_model_file(self, model_file): self._model_file = model_file self._tagger.open(self._model_file) def _get_features(self, tokens, idx): """ Extract basic features about this word including - Current Word - Is Capitalized ? - Has Punctuation ? - Has Number ? - Suffixes up to length 3 Note that : we might include feature over previous word, next word ect. 
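For illustration, a capitalized token containing a digit, such as 'Testing3', yields ['CAPITALIZATION', 'HAS_NUM', 'SUF_3', 'SUF_g3', 'SUF_ng3', 'WORD_Testing3'].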
:return : a list which contains the features :rtype : list(str) """ token = tokens[idx] feature_list = [] # Capitalization if token[0].isupper(): feature_list.append('CAPITALIZATION') # Number if re.search(self._pattern, token) is not None: feature_list.append('HAS_NUM') # Punctuation punc_cat = set(["Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po"]) if all (unicodedata.category(x) in punc_cat for x in token): feature_list.append('PUNCTUATION') # Suffix up to length 3 if len(token) > 1: feature_list.append('SUF_' + token[-1:]) if len(token) > 2: feature_list.append('SUF_' + token[-2:]) if len(token) > 3: feature_list.append('SUF_' + token[-3:]) feature_list.append('WORD_' + token ) return feature_list def tag_sents(self, sents): ''' Tag a list of sentences. NB before using this function, user should specify the mode_file either by - Train a new model using ``train'' function - Use the pre-trained model which is set via ``set_model_file'' function :params sentences : list of sentences needed to tag. :type sentences : list(list(str)) :return : list of tagged sentences. :rtype : list (list (tuple(str,str))) ''' if self._model_file == '': raise Exception(' No model file is found !! Please use train or set_model_file function') # We need the list of sentences instead of the list generator for matching the input and output result = [] for tokens in sents: features = [self._feature_func(tokens,i) for i in range(len(tokens))] labels = self._tagger.tag(features) if len(labels) != len(tokens): raise Exception(' Predicted Length Not Matched, Expect Errors !') tagged_sent = list(zip(tokens,labels)) result.append(tagged_sent) return result def train(self, train_data, model_file): ''' Train the CRF tagger using CRFSuite :params train_data : is the list of annotated sentences. :type train_data : list (list(tuple(str,str))) :params model_file : the model will be saved to this file. ''' trainer = pycrfsuite.Trainer(verbose=self._verbose) trainer.set_params(self._training_options) for sent in train_data: tokens,labels = zip(*sent) features = [self._feature_func(tokens,i) for i in range(len(tokens))] trainer.append(features,labels) # Now train the model, the output should be model_file trainer.train(model_file) # Save the model file self.set_model_file(model_file) def tag(self, tokens): ''' Tag a sentence using Python CRFSuite Tagger. NB before using this function, user should specify the mode_file either by - Train a new model using ``train'' function - Use the pre-trained model which is set via ``set_model_file'' function :params tokens : list of tokens needed to tag. :type tokens : list(str) :return : list of tagged tokens. :rtype : list (tuple(str,str)) ''' return self.tag_sents([tokens])[0] nltk-3.1/nltk/tag/hmm.py0000644000076500000240000014014512607224144014741 0ustar sbstaff00000000000000# Natural Language Toolkit: Hidden Markov Model # # Copyright (C) 2001-2015 NLTK Project # Author: Trevor Cohn # Philip Blunsom # Tiago Tresoldi (fixes) # Steven Bird (fixes) # Joseph Frazee (fixes) # Steven Xu (fixes) # URL: # For license information, see LICENSE.TXT """ Hidden Markov Models (HMMs) largely used to assign the correct label sequence to sequential data or assess the probability of a given label and data sequence. These models are finite state machines characterised by a number of states, transitions between these states, and output symbols emitted while in each state. The HMM is an extension to the Markov chain, where each state corresponds deterministically to a given event. 
In the HMM the observation is a probabilistic function of the state. HMMs share the Markov chain's assumption, being that the probability of transition from one state to another only depends on the current state - i.e. the series of states that led to the current state are not used. They are also time invariant. The HMM is a directed graph, with probability weighted edges (representing the probability of a transition between the source and sink states) where each vertex emits an output symbol when entered. The symbol (or observation) is non-deterministically generated. For this reason, knowing that a sequence of output observations was generated by a given HMM does not mean that the corresponding sequence of states (and what the current state is) is known. This is the 'hidden' in the hidden markov model. Formally, a HMM can be characterised by: - the output observation alphabet. This is the set of symbols which may be observed as output of the system. - the set of states. - the transition probabilities *a_{ij} = P(s_t = j | s_{t-1} = i)*. These represent the probability of transition to each state from a given state. - the output probability matrix *b_i(k) = P(X_t = o_k | s_t = i)*. These represent the probability of observing each symbol in a given state. - the initial state distribution. This gives the probability of starting in each state. To ground this discussion, take a common NLP application, part-of-speech (POS) tagging. An HMM is desirable for this task as the highest probability tag sequence can be calculated for a given sequence of word forms. This differs from other tagging techniques which often tag each word individually, seeking to optimise each individual tagging greedily without regard to the optimal combination of tags for a larger unit, such as a sentence. The HMM does this with the Viterbi algorithm, which efficiently computes the optimal path through the graph given the sequence of words forms. In POS tagging the states usually have a 1:1 correspondence with the tag alphabet - i.e. each state represents a single tag. The output observation alphabet is the set of word forms (the lexicon), and the remaining three parameters are derived by a training regime. With this information the probability of a given sentence can be easily derived, by simply summing the probability of each distinct path through the model. Similarly, the highest probability tagging sequence can be derived with the Viterbi algorithm, yielding a state sequence which can be mapped into a tag sequence. This discussion assumes that the HMM has been trained. This is probably the most difficult task with the model, and requires either MLE estimates of the parameters or unsupervised learning using the Baum-Welch algorithm, a variant of EM. For more information, please consult the source code for this module, which includes extensive demonstration code. 
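Putting the parameters together, the joint probability of an observation sequence *o_0, ..., o_{T-1}* and a state sequence *s_0, ..., s_{T-1}* factorises as *P(s_0) b_{s_0}(o_0) prod_{t=1..T-1} a_{s_{t-1} s_t} b_{s_t}(o_t)*, and tagging amounts to finding the state sequence that maximises this quantity for the observed words. A minimal supervised training run looks roughly as follows (an illustrative sketch only):

>>> from nltk.corpus import treebank
>>> from nltk.tag.hmm import HiddenMarkovModelTagger
>>> tagger = HiddenMarkovModelTagger.train(treebank.tagged_sents()[:3000]) # doctest: +SKIP
>>> tagger.tag("Today is a good day .".split()) # doctest: +SKIP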
""" from __future__ import print_function, unicode_literals, division import re import itertools try: import numpy as np except ImportError: pass from nltk.probability import (FreqDist, ConditionalFreqDist, ConditionalProbDist, DictionaryProbDist, DictionaryConditionalProbDist, LidstoneProbDist, MutableProbDist, MLEProbDist, RandomProbDist) from nltk.metrics import accuracy from nltk.util import LazyMap, unique_list from nltk.compat import python_2_unicode_compatible, izip, imap from nltk.tag.api import TaggerI _TEXT = 0 # index of text in a tuple _TAG = 1 # index of tag in a tuple def _identity(labeled_symbols): return labeled_symbols @python_2_unicode_compatible class HiddenMarkovModelTagger(TaggerI): """ Hidden Markov model class, a generative model for labelling sequence data. These models define the joint probability of a sequence of symbols and their labels (state transitions) as the product of the starting state probability, the probability of each state transition, and the probability of each observation being generated from each state. This is described in more detail in the module documentation. This implementation is based on the HMM description in Chapter 8, Huang, Acero and Hon, Spoken Language Processing and includes an extension for training shallow HMM parsers or specialized HMMs as in Molina et. al, 2002. A specialized HMM modifies training data by applying a specialization function to create a new training set that is more appropriate for sequential tagging with an HMM. A typical use case is chunking. :param symbols: the set of output symbols (alphabet) :type symbols: seq of any :param states: a set of states representing state space :type states: seq of any :param transitions: transition probabilities; Pr(s_i | s_j) is the probability of transition from state i given the model is in state_j :type transitions: ConditionalProbDistI :param outputs: output probabilities; Pr(o_k | s_i) is the probability of emitting symbol k when entering state i :type outputs: ConditionalProbDistI :param priors: initial state distribution; Pr(s_i) is the probability of starting in state i :type priors: ProbDistI :param transform: an optional function for transforming training instances, defaults to the identity function. 
:type transform: callable """ def __init__(self, symbols, states, transitions, outputs, priors, transform=_identity): self._symbols = unique_list(symbols) self._states = unique_list(states) self._transitions = transitions self._outputs = outputs self._priors = priors self._cache = None self._transform = transform @classmethod def _train(cls, labeled_sequence, test_sequence=None, unlabeled_sequence=None, transform=_identity, estimator=None, **kwargs): if estimator is None: def estimator(fd, bins): return LidstoneProbDist(fd, 0.1, bins) labeled_sequence = LazyMap(transform, labeled_sequence) symbols = unique_list(word for sent in labeled_sequence for word, tag in sent) tag_set = unique_list(tag for sent in labeled_sequence for word, tag in sent) trainer = HiddenMarkovModelTrainer(tag_set, symbols) hmm = trainer.train_supervised(labeled_sequence, estimator=estimator) hmm = cls(hmm._symbols, hmm._states, hmm._transitions, hmm._outputs, hmm._priors, transform=transform) if test_sequence: hmm.test(test_sequence, verbose=kwargs.get('verbose', False)) if unlabeled_sequence: max_iterations = kwargs.get('max_iterations', 5) hmm = trainer.train_unsupervised(unlabeled_sequence, model=hmm, max_iterations=max_iterations) if test_sequence: hmm.test(test_sequence, verbose=kwargs.get('verbose', False)) return hmm @classmethod def train(cls, labeled_sequence, test_sequence=None, unlabeled_sequence=None, **kwargs): """ Train a new HiddenMarkovModelTagger using the given labeled and unlabeled training instances. Testing will be performed if test instances are provided. :return: a hidden markov model tagger :rtype: HiddenMarkovModelTagger :param labeled_sequence: a sequence of labeled training instances, i.e. a list of sentences represented as tuples :type labeled_sequence: list(list) :param test_sequence: a sequence of labeled test instances :type test_sequence: list(list) :param unlabeled_sequence: a sequence of unlabeled training instances, i.e. a list of sentences represented as words :type unlabeled_sequence: list(list) :param transform: an optional function for transforming training instances, defaults to the identity function, see ``transform()`` :type transform: function :param estimator: an optional function or class that maps a condition's frequency distribution to its probability distribution, defaults to a Lidstone distribution with gamma = 0.1 :type estimator: class or function :param verbose: boolean flag indicating whether training should be verbose or include printed output :type verbose: bool :param max_iterations: number of Baum-Welch interations to perform :type max_iterations: int """ return cls._train(labeled_sequence, test_sequence, unlabeled_sequence, **kwargs) def probability(self, sequence): """ Returns the probability of the given symbol sequence. If the sequence is labelled, then returns the joint probability of the symbol, state sequence. Otherwise, uses the forward algorithm to find the probability over all label sequences. :return: the probability of the sequence :rtype: float :param sequence: the sequence of symbols which must contain the TEXT property, and optionally the TAG property :type sequence: Token """ return 2**(self.log_probability(self._transform(sequence))) def log_probability(self, sequence): """ Returns the log-probability of the given symbol sequence. If the sequence is labelled, then returns the joint log-probability of the symbol, state sequence. Otherwise, uses the forward algorithm to find the log-probability over all label sequences. 
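For a labelled sequence this is the base-2 log of the joint probability, i.e. *log P(s_0) + log b_{s_0}(o_0) + sum_{t>0} [ log a_{s_{t-1} s_t} + log b_{s_t}(o_t) ]*; for an unlabelled sequence it is the log of the summed forward probabilities at the final time step.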
:return: the log-probability of the sequence :rtype: float :param sequence: the sequence of symbols which must contain the TEXT property, and optionally the TAG property :type sequence: Token """ sequence = self._transform(sequence) T = len(sequence) if T > 0 and sequence[0][_TAG]: last_state = sequence[0][_TAG] p = self._priors.logprob(last_state) + \ self._output_logprob(last_state, sequence[0][_TEXT]) for t in range(1, T): state = sequence[t][_TAG] p += self._transitions[last_state].logprob(state) + \ self._output_logprob(state, sequence[t][_TEXT]) last_state = state return p else: alpha = self._forward_probability(sequence) p = logsumexp2(alpha[T-1]) return p def tag(self, unlabeled_sequence): """ Tags the sequence with the highest probability state sequence. This uses the best_path method to find the Viterbi path. :return: a labelled sequence of symbols :rtype: list :param unlabeled_sequence: the sequence of unlabeled symbols :type unlabeled_sequence: list """ unlabeled_sequence = self._transform(unlabeled_sequence) return self._tag(unlabeled_sequence) def _tag(self, unlabeled_sequence): path = self._best_path(unlabeled_sequence) return list(izip(unlabeled_sequence, path)) def _output_logprob(self, state, symbol): """ :return: the log probability of the symbol being observed in the given state :rtype: float """ return self._outputs[state].logprob(symbol) def _create_cache(self): """ The cache is a tuple (P, O, X, S) where: - S maps symbols to integers. I.e., it is the inverse mapping from self._symbols; for each symbol s in self._symbols, the following is true:: self._symbols[S[s]] == s - O is the log output probabilities:: O[i,k] = log( P(token[t]=sym[k]|tag[t]=state[i]) ) - X is the log transition probabilities:: X[i,j] = log( P(tag[t]=state[j]|tag[t-1]=state[i]) ) - P is the log prior probabilities:: P[i] = log( P(tag[0]=state[i]) ) """ if not self._cache: N = len(self._states) M = len(self._symbols) P = np.zeros(N, np.float32) X = np.zeros((N, N), np.float32) O = np.zeros((N, M), np.float32) for i in range(N): si = self._states[i] P[i] = self._priors.logprob(si) for j in range(N): X[i, j] = self._transitions[si].logprob(self._states[j]) for k in range(M): O[i, k] = self._output_logprob(si, self._symbols[k]) S = {} for k in range(M): S[self._symbols[k]] = k self._cache = (P, O, X, S) def _update_cache(self, symbols): # add new symbols to the symbol table and repopulate the output # probabilities and symbol table mapping if symbols: self._create_cache() P, O, X, S = self._cache for symbol in symbols: if symbol not in self._symbols: self._cache = None self._symbols.append(symbol) # don't bother with the work if there aren't any new symbols if not self._cache: N = len(self._states) M = len(self._symbols) Q = O.shape[1] # add new columns to the output probability table without # destroying the old probabilities O = np.hstack([O, np.zeros((N, M - Q), np.float32)]) for i in range(N): si = self._states[i] # only calculate probabilities for new symbols for k in range(Q, M): O[i, k] = self._output_logprob(si, self._symbols[k]) # only create symbol mappings for new symbols for k in range(Q, M): S[self._symbols[k]] = k self._cache = (P, O, X, S) def reset_cache(self): self._cache = None def best_path(self, unlabeled_sequence): """ Returns the state sequence of the optimal (most probable) path through the HMM. Uses the Viterbi algorithm to calculate this part by dynamic programming. 
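Concretely, working with base-2 log probabilities, the recursion is *V[0][i] = log P(s_i) + log b_i(o_0)* and *V[t][j] = max_i ( V[t-1][i] + log a_{ij} ) + log b_j(o_t)*, with a back-pointer recording the maximising *i* at each step; the best final state is then traced back to recover the full state sequence.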
:return: the state sequence :rtype: sequence of any :param unlabeled_sequence: the sequence of unlabeled symbols :type unlabeled_sequence: list """ unlabeled_sequence = self._transform(unlabeled_sequence) return self._best_path(unlabeled_sequence) def _best_path(self, unlabeled_sequence): T = len(unlabeled_sequence) N = len(self._states) self._create_cache() self._update_cache(unlabeled_sequence) P, O, X, S = self._cache V = np.zeros((T, N), np.float32) B = -np.ones((T, N), np.int) V[0] = P + O[:, S[unlabeled_sequence[0]]] for t in range(1, T): for j in range(N): vs = V[t-1, :] + X[:, j] best = np.argmax(vs) V[t, j] = vs[best] + O[j, S[unlabeled_sequence[t]]] B[t, j] = best current = np.argmax(V[T-1,:]) sequence = [current] for t in range(T-1, 0, -1): last = B[t, current] sequence.append(last) current = last sequence.reverse() return list(map(self._states.__getitem__, sequence)) def best_path_simple(self, unlabeled_sequence): """ Returns the state sequence of the optimal (most probable) path through the HMM. Uses the Viterbi algorithm to calculate this part by dynamic programming. This uses a simple, direct method, and is included for teaching purposes. :return: the state sequence :rtype: sequence of any :param unlabeled_sequence: the sequence of unlabeled symbols :type unlabeled_sequence: list """ unlabeled_sequence = self._transform(unlabeled_sequence) return self._best_path_simple(unlabeled_sequence) def _best_path_simple(self, unlabeled_sequence): T = len(unlabeled_sequence) N = len(self._states) V = np.zeros((T, N), np.float64) B = {} # find the starting log probabilities for each state symbol = unlabeled_sequence[0] for i, state in enumerate(self._states): V[0, i] = self._priors.logprob(state) + \ self._output_logprob(state, symbol) B[0, state] = None # find the maximum log probabilities for reaching each state at time t for t in range(1, T): symbol = unlabeled_sequence[t] for j in range(N): sj = self._states[j] best = None for i in range(N): si = self._states[i] va = V[t-1, i] + self._transitions[si].logprob(sj) if not best or va > best[0]: best = (va, si) V[t, j] = best[0] + self._output_logprob(sj, symbol) B[t, sj] = best[1] # find the highest probability final state best = None for i in range(N): val = V[T-1, i] if not best or val > best[0]: best = (val, self._states[i]) # traverse the back-pointers B to find the state sequence current = best[1] sequence = [current] for t in range(T-1, 0, -1): last = B[t, current] sequence.append(last) current = last sequence.reverse() return sequence def random_sample(self, rng, length): """ Randomly sample the HMM to generate a sentence of a given length. This samples the prior distribution then the observation distribution and transition distribution for each subsequent observation and state. This will mostly generate unintelligible garbage, but can provide some amusement. :return: the randomly created state/observation sequence, generated according to the HMM's probability distributions. The SUBTOKENS have TEXT and TAG properties containing the observation and state respectively. 
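For example (an illustrative sketch; ``model`` is assumed to be a trained
tagger, and any seeded ``random.Random`` instance will do)::

    import random

    rng = random.Random()
    rng.seed(0)
    sampled = model.random_sample(rng, 10)   # a list of (symbol, state) pairs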
:rtype: list :param rng: random number generator :type rng: Random (or any object with a random() method) :param length: desired output length :type length: int """ # sample the starting state and symbol prob dists tokens = [] state = self._sample_probdist(self._priors, rng.random(), self._states) symbol = self._sample_probdist(self._outputs[state], rng.random(), self._symbols) tokens.append((symbol, state)) for i in range(1, length): # sample the state transition and symbol prob dists state = self._sample_probdist(self._transitions[state], rng.random(), self._states) symbol = self._sample_probdist(self._outputs[state], rng.random(), self._symbols) tokens.append((symbol, state)) return tokens def _sample_probdist(self, probdist, p, samples): cum_p = 0 for sample in samples: add_p = probdist.prob(sample) if cum_p <= p <= cum_p + add_p: return sample cum_p += add_p raise Exception('Invalid probability distribution - ' 'does not sum to one') def entropy(self, unlabeled_sequence): """ Returns the entropy over labellings of the given sequence. This is given by:: H(O) = - sum_S Pr(S | O) log Pr(S | O) where the summation ranges over all state sequences, S. Let *Z = Pr(O) = sum_S Pr(S, O)}* where the summation ranges over all state sequences and O is the observation sequence. As such the entropy can be re-expressed as:: H = - sum_S Pr(S | O) log [ Pr(S, O) / Z ] = log Z - sum_S Pr(S | O) log Pr(S, 0) = log Z - sum_S Pr(S | O) [ log Pr(S_0) + sum_t Pr(S_t | S_{t-1}) + sum_t Pr(O_t | S_t) ] The order of summation for the log terms can be flipped, allowing dynamic programming to be used to calculate the entropy. Specifically, we use the forward and backward probabilities (alpha, beta) giving:: H = log Z - sum_s0 alpha_0(s0) beta_0(s0) / Z * log Pr(s0) + sum_t,si,sj alpha_t(si) Pr(sj | si) Pr(O_t+1 | sj) beta_t(sj) / Z * log Pr(sj | si) + sum_t,st alpha_t(st) beta_t(st) / Z * log Pr(O_t | st) This simply uses alpha and beta to find the probabilities of partial sequences, constrained to include the given state(s) at some point in time. """ unlabeled_sequence = self._transform(unlabeled_sequence) T = len(unlabeled_sequence) N = len(self._states) alpha = self._forward_probability(unlabeled_sequence) beta = self._backward_probability(unlabeled_sequence) normalisation = logsumexp2(alpha[T-1]) entropy = normalisation # starting state, t = 0 for i, state in enumerate(self._states): p = 2**(alpha[0, i] + beta[0, i] - normalisation) entropy -= p * self._priors.logprob(state) #print 'p(s_0 = %s) =' % state, p # state transitions for t0 in range(T - 1): t1 = t0 + 1 for i0, s0 in enumerate(self._states): for i1, s1 in enumerate(self._states): p = 2**(alpha[t0, i0] + self._transitions[s0].logprob(s1) + self._outputs[s1].logprob( unlabeled_sequence[t1][_TEXT]) + beta[t1, i1] - normalisation) entropy -= p * self._transitions[s0].logprob(s1) #print 'p(s_%d = %s, s_%d = %s) =' % (t0, s0, t1, s1), p # symbol emissions for t in range(T): for i, state in enumerate(self._states): p = 2**(alpha[t, i] + beta[t, i] - normalisation) entropy -= p * self._outputs[state].logprob( unlabeled_sequence[t][_TEXT]) #print 'p(s_%d = %s) =' % (t, state), p return entropy def point_entropy(self, unlabeled_sequence): """ Returns the pointwise entropy over the possible states at each position in the chain, given the observation sequence. 
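Concretely, using the forward and backward log-probabilities (alpha, beta)
and the normalisation term log2 Pr(O), the entropy at position t is::

    H_t = - sum_s Pr(S_t = s | O) log2 Pr(S_t = s | O)

where Pr(S_t = s | O) = 2**( alpha_t(s) + beta_t(s) - log2 Pr(O) ).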
""" unlabeled_sequence = self._transform(unlabeled_sequence) T = len(unlabeled_sequence) N = len(self._states) alpha = self._forward_probability(unlabeled_sequence) beta = self._backward_probability(unlabeled_sequence) normalisation = logsumexp2(alpha[T-1]) entropies = np.zeros(T, np.float64) probs = np.zeros(N, np.float64) for t in range(T): for s in range(N): probs[s] = alpha[t, s] + beta[t, s] - normalisation for s in range(N): entropies[t] -= 2**(probs[s]) * probs[s] return entropies def _exhaustive_entropy(self, unlabeled_sequence): unlabeled_sequence = self._transform(unlabeled_sequence) T = len(unlabeled_sequence) N = len(self._states) labellings = [[state] for state in self._states] for t in range(T - 1): current = labellings labellings = [] for labelling in current: for state in self._states: labellings.append(labelling + [state]) log_probs = [] for labelling in labellings: labeled_sequence = unlabeled_sequence[:] for t, label in enumerate(labelling): labeled_sequence[t] = (labeled_sequence[t][_TEXT], label) lp = self.log_probability(labeled_sequence) log_probs.append(lp) normalisation = _log_add(*log_probs) #ps = zeros((T, N), float64) #for labelling, lp in zip(labellings, log_probs): #for t in range(T): #ps[t, self._states.index(labelling[t])] += \ # 2**(lp - normalisation) #for t in range(T): #print 'prob[%d] =' % t, ps[t] entropy = 0 for lp in log_probs: lp -= normalisation entropy -= 2**(lp) * lp return entropy def _exhaustive_point_entropy(self, unlabeled_sequence): unlabeled_sequence = self._transform(unlabeled_sequence) T = len(unlabeled_sequence) N = len(self._states) labellings = [[state] for state in self._states] for t in range(T - 1): current = labellings labellings = [] for labelling in current: for state in self._states: labellings.append(labelling + [state]) log_probs = [] for labelling in labellings: labelled_sequence = unlabeled_sequence[:] for t, label in enumerate(labelling): labelled_sequence[t] = (labelled_sequence[t][_TEXT], label) lp = self.log_probability(labelled_sequence) log_probs.append(lp) normalisation = _log_add(*log_probs) probabilities = _ninf_array((T,N)) for labelling, lp in zip(labellings, log_probs): lp -= normalisation for t, label in enumerate(labelling): index = self._states.index(label) probabilities[t, index] = _log_add(probabilities[t, index], lp) entropies = np.zeros(T, np.float64) for t in range(T): for s in range(N): entropies[t] -= 2**(probabilities[t, s]) * probabilities[t, s] return entropies def _transitions_matrix(self): """ Return a matrix of transition log probabilities. """ trans_iter = (self._transitions[sj].logprob(si) for sj in self._states for si in self._states) transitions_logprob = np.fromiter(trans_iter, dtype=np.float64) N = len(self._states) return transitions_logprob.reshape((N, N)).T def _outputs_vector(self, symbol): """ Return a vector with log probabilities of emitting a symbol when entering states. """ out_iter = (self._output_logprob(sj, symbol) for sj in self._states) return np.fromiter(out_iter, dtype=np.float64) def _forward_probability(self, unlabeled_sequence): """ Return the forward probability matrix, a T by N array of log-probabilities, where T is the length of the sequence and N is the number of states. Each entry (t, s) gives the probability of being in state s at time t after observing the partial symbol sequence up to and including t. 
:param unlabeled_sequence: the sequence of unlabeled symbols :type unlabeled_sequence: list :return: the forward log probability matrix :rtype: array """ T = len(unlabeled_sequence) N = len(self._states) alpha = _ninf_array((T, N)) transitions_logprob = self._transitions_matrix() # Initialization symbol = unlabeled_sequence[0][_TEXT] for i, state in enumerate(self._states): alpha[0, i] = self._priors.logprob(state) + \ self._output_logprob(state, symbol) # Induction for t in range(1, T): symbol = unlabeled_sequence[t][_TEXT] output_logprob = self._outputs_vector(symbol) for i in range(N): summand = alpha[t-1] + transitions_logprob[i] alpha[t, i] = logsumexp2(summand) + output_logprob[i] return alpha def _backward_probability(self, unlabeled_sequence): """ Return the backward probability matrix, a T by N array of log-probabilities, where T is the length of the sequence and N is the number of states. Each entry (t, s) gives the probability of being in state s at time t after observing the partial symbol sequence from t .. T. :return: the backward log probability matrix :rtype: array :param unlabeled_sequence: the sequence of unlabeled symbols :type unlabeled_sequence: list """ T = len(unlabeled_sequence) N = len(self._states) beta = _ninf_array((T, N)) transitions_logprob = self._transitions_matrix().T # initialise the backward values; # "1" is an arbitrarily chosen value from Rabiner tutorial beta[T-1, :] = np.log2(1) # inductively calculate remaining backward values for t in range(T-2, -1, -1): symbol = unlabeled_sequence[t+1][_TEXT] outputs = self._outputs_vector(symbol) for i in range(N): summand = transitions_logprob[i] + beta[t+1] + outputs beta[t, i] = logsumexp2(summand) return beta def test(self, test_sequence, verbose=False, **kwargs): """ Tests the HiddenMarkovModelTagger instance. :param test_sequence: a sequence of labeled test instances :type test_sequence: list(list) :param verbose: boolean flag indicating whether training should be verbose or include printed output :type verbose: bool """ def words(sent): return [word for (word, tag) in sent] def tags(sent): return [tag for (word, tag) in sent] def flatten(seq): return list(itertools.chain(*seq)) test_sequence = self._transform(test_sequence) predicted_sequence = list(imap(self._tag, imap(words, test_sequence))) if verbose: for test_sent, predicted_sent in izip(test_sequence, predicted_sequence): print('Test:', ' '.join('%s/%s' % (token, tag) for (token, tag) in test_sent)) print() print('Untagged:', ' '.join("%s" % token for (token, tag) in test_sent)) print() print('HMM-tagged:', ' '.join('%s/%s' % (token, tag) for (token, tag) in predicted_sent)) print() print('Entropy:', self.entropy([(token, None) for (token, tag) in predicted_sent])) print() print('-' * 60) test_tags = flatten(imap(tags, test_sequence)) predicted_tags = flatten(imap(tags, predicted_sequence)) acc = accuracy(test_tags, predicted_tags) count = sum(len(sent) for sent in test_sequence) print('accuracy over %d tokens: %.2f' % (count, acc * 100)) def __repr__(self): return ('' % (len(self._states), len(self._symbols))) class HiddenMarkovModelTrainer(object): """ Algorithms for learning HMM parameters from training data. These include both supervised learning (MLE) and unsupervised learning (Baum-Welch). Creates an HMM trainer to induce an HMM with the given states and output symbol alphabet. A supervised and unsupervised training method may be used. If either of the states or symbols are not given, these may be derived from supervised training. 
:param states: the set of state labels :type states: sequence of any :param symbols: the set of observation symbols :type symbols: sequence of any """ def __init__(self, states=None, symbols=None): self._states = (states if states else []) self._symbols = (symbols if symbols else []) def train(self, labeled_sequences=None, unlabeled_sequences=None, **kwargs): """ Trains the HMM using both (or either of) supervised and unsupervised techniques. :return: the trained model :rtype: HiddenMarkovModelTagger :param labelled_sequences: the supervised training data, a set of labelled sequences of observations :type labelled_sequences: list :param unlabeled_sequences: the unsupervised training data, a set of sequences of observations :type unlabeled_sequences: list :param kwargs: additional arguments to pass to the training methods """ assert labeled_sequences or unlabeled_sequences model = None if labeled_sequences: model = self.train_supervised(labeled_sequences, **kwargs) if unlabeled_sequences: if model: kwargs['model'] = model model = self.train_unsupervised(unlabeled_sequences, **kwargs) return model def _baum_welch_step(self, sequence, model, symbol_to_number): N = len(model._states) M = len(model._symbols) T = len(sequence) # compute forward and backward probabilities alpha = model._forward_probability(sequence) beta = model._backward_probability(sequence) # find the log probability of the sequence lpk = logsumexp2(alpha[T-1]) A_numer = _ninf_array((N, N)) B_numer = _ninf_array((N, M)) A_denom = _ninf_array(N) B_denom = _ninf_array(N) transitions_logprob = model._transitions_matrix().T for t in range(T): symbol = sequence[t][_TEXT] # not found? FIXME next_symbol = None if t < T - 1: next_symbol = sequence[t+1][_TEXT] # not found? FIXME xi = symbol_to_number[symbol] next_outputs_logprob = model._outputs_vector(next_symbol) alpha_plus_beta = alpha[t] + beta[t] if t < T - 1: numer_add = transitions_logprob + next_outputs_logprob + \ beta[t+1] + alpha[t].reshape(N, 1) A_numer = np.logaddexp2(A_numer, numer_add) A_denom = np.logaddexp2(A_denom, alpha_plus_beta) else: B_denom = np.logaddexp2(A_denom, alpha_plus_beta) B_numer[:,xi] = np.logaddexp2(B_numer[:,xi], alpha_plus_beta) return lpk, A_numer, A_denom, B_numer, B_denom def train_unsupervised(self, unlabeled_sequences, update_outputs=True, **kwargs): """ Trains the HMM using the Baum-Welch algorithm to maximise the probability of the data sequence. This is a variant of the EM algorithm, and is unsupervised in that it doesn't need the state sequences for the symbols. The code is based on 'A Tutorial on Hidden Markov Models and Selected Applications in Speech Recognition', Lawrence Rabiner, IEEE, 1989. 
:return: the trained model :rtype: HiddenMarkovModelTagger :param unlabeled_sequences: the training data, a set of sequences of observations :type unlabeled_sequences: list kwargs may include following parameters: :param model: a HiddenMarkovModelTagger instance used to begin the Baum-Welch algorithm :param max_iterations: the maximum number of EM iterations :param convergence_logprob: the maximum change in log probability to allow convergence """ # create a uniform HMM, which will be iteratively refined, unless # given an existing model model = kwargs.get('model') if not model: priors = RandomProbDist(self._states) transitions = DictionaryConditionalProbDist( dict((state, RandomProbDist(self._states)) for state in self._states)) outputs = DictionaryConditionalProbDist( dict((state, RandomProbDist(self._symbols)) for state in self._states)) model = HiddenMarkovModelTagger(self._symbols, self._states, transitions, outputs, priors) self._states = model._states self._symbols = model._symbols N = len(self._states) M = len(self._symbols) symbol_numbers = dict((sym, i) for i, sym in enumerate(self._symbols)) # update model prob dists so that they can be modified # model._priors = MutableProbDist(model._priors, self._states) model._transitions = DictionaryConditionalProbDist( dict((s, MutableProbDist(model._transitions[s], self._states)) for s in self._states)) if update_outputs: model._outputs = DictionaryConditionalProbDist( dict((s, MutableProbDist(model._outputs[s], self._symbols)) for s in self._states)) model.reset_cache() # iterate until convergence converged = False last_logprob = None iteration = 0 max_iterations = kwargs.get('max_iterations', 1000) epsilon = kwargs.get('convergence_logprob', 1e-6) while not converged and iteration < max_iterations: A_numer = _ninf_array((N, N)) B_numer = _ninf_array((N, M)) A_denom = _ninf_array(N) B_denom = _ninf_array(N) logprob = 0 for sequence in unlabeled_sequences: sequence = list(sequence) if not sequence: continue (lpk, seq_A_numer, seq_A_denom, seq_B_numer, seq_B_denom) = self._baum_welch_step(sequence, model, symbol_numbers) # add these sums to the global A and B values for i in range(N): A_numer[i] = np.logaddexp2(A_numer[i], seq_A_numer[i]-lpk) B_numer[i] = np.logaddexp2(B_numer[i], seq_B_numer[i]-lpk) A_denom = np.logaddexp2(A_denom, seq_A_denom-lpk) B_denom = np.logaddexp2(B_denom, seq_B_denom-lpk) logprob += lpk # use the calculated values to update the transition and output # probability values for i in range(N): logprob_Ai = A_numer[i] - A_denom[i] logprob_Bi = B_numer[i] - B_denom[i] # We should normalize all probabilities (see p.391 Huang et al) # Let sum(P) be K. # We can divide each Pi by K to make sum(P) == 1. # Pi' = Pi/K # log2(Pi') = log2(Pi) - log2(K) logprob_Ai -= logsumexp2(logprob_Ai) logprob_Bi -= logsumexp2(logprob_Bi) # update output and transition probabilities si = self._states[i] for j in range(N): sj = self._states[j] model._transitions[si].update(sj, logprob_Ai[j]) if update_outputs: for k in range(M): ok = self._symbols[k] model._outputs[si].update(ok, logprob_Bi[k]) # Rabiner says the priors don't need to be updated. I don't # believe him. FIXME # test for convergence if iteration > 0 and abs(logprob - last_logprob) < epsilon: converged = True print('iteration', iteration, 'logprob', logprob) iteration += 1 last_logprob = logprob return model def train_supervised(self, labelled_sequences, estimator=None): """ Supervised training maximising the joint probability of the symbol and state sequences. 
This is done via collecting frequencies of transitions between states, symbol observations while within each state and which states start a sentence. These frequency distributions are then normalised into probability estimates, which can be smoothed if desired. :return: the trained model :rtype: HiddenMarkovModelTagger :param labelled_sequences: the training data, a set of labelled sequences of observations :type labelled_sequences: list :param estimator: a function taking a FreqDist and a number of bins and returning a CProbDistI; otherwise a MLE estimate is used """ # default to the MLE estimate if estimator is None: estimator = lambda fdist, bins: MLEProbDist(fdist) # count occurrences of starting states, transitions out of each state # and output symbols observed in each state known_symbols = set(self._symbols) known_states = set(self._states) starting = FreqDist() transitions = ConditionalFreqDist() outputs = ConditionalFreqDist() for sequence in labelled_sequences: lasts = None for token in sequence: state = token[_TAG] symbol = token[_TEXT] if lasts is None: starting[state] += 1 else: transitions[lasts][state] += 1 outputs[state][symbol] += 1 lasts = state # update the state and symbol lists if state not in known_states: self._states.append(state) known_states.add(state) if symbol not in known_symbols: self._symbols.append(symbol) known_symbols.add(symbol) # create probability distributions (with smoothing) N = len(self._states) pi = estimator(starting, N) A = ConditionalProbDist(transitions, estimator, N) B = ConditionalProbDist(outputs, estimator, len(self._symbols)) return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi) def _ninf_array(shape): res = np.empty(shape, np.float64) res.fill(-np.inf) return res def logsumexp2(arr): max_ = arr.max() return np.log2(np.sum(2**(arr - max_))) + max_ def _log_add(*values): """ Adds the logged values, returning the logarithm of the addition. 
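That is, for log2-domain inputs v_1 .. v_n it returns::

    log2( 2**v_1 + ... + 2**v_n )

computed stably by factoring out the largest input before exponentiating.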
""" x = max(values) if x > -np.inf: sum_diffs = 0 for value in values: sum_diffs += 2**(value - x) return x + np.log2(sum_diffs) else: return x def _create_hmm_tagger(states, symbols, A, B, pi): def pd(values, samples): d = dict(zip(samples, values)) return DictionaryProbDist(d) def cpd(array, conditions, samples): d = {} for values, condition in zip(array, conditions): d[condition] = pd(values, samples) return DictionaryConditionalProbDist(d) A = cpd(A, states, states) B = cpd(B, states, symbols) pi = pd(pi, states) return HiddenMarkovModelTagger(symbols=symbols, states=states, transitions=A, outputs=B, priors=pi) def _market_hmm_example(): """ Return an example HMM (described at page 381, Huang et al) """ states = ['bull', 'bear', 'static'] symbols = ['up', 'down', 'unchanged'] A = np.array([[0.6, 0.2, 0.2], [0.5, 0.3, 0.2], [0.4, 0.1, 0.5]], np.float64) B = np.array([[0.7, 0.1, 0.2], [0.1, 0.6, 0.3], [0.3, 0.3, 0.4]], np.float64) pi = np.array([0.5, 0.2, 0.3], np.float64) model = _create_hmm_tagger(states, symbols, A, B, pi) return model, states, symbols def demo(): # demonstrates HMM probability calculation print() print("HMM probability calculation demo") print() model, states, symbols = _market_hmm_example() print('Testing', model) for test in [['up', 'up'], ['up', 'down', 'up'], ['down'] * 5, ['unchanged'] * 5 + ['up']]: sequence = [(t, None) for t in test] print('Testing with state sequence', test) print('probability =', model.probability(sequence)) print('tagging = ', model.tag([word for (word,tag) in sequence])) print('p(tagged) = ', model.probability(sequence)) print('H = ', model.entropy(sequence)) print('H_exh = ', model._exhaustive_entropy(sequence)) print('H(point) = ', model.point_entropy(sequence)) print('H_exh(point)=', model._exhaustive_point_entropy(sequence)) print() def load_pos(num_sents): from nltk.corpus import brown sentences = brown.tagged_sents(categories='news')[:num_sents] tag_re = re.compile(r'[*]|--|[^+*-]+') tag_set = set() symbols = set() cleaned_sentences = [] for sentence in sentences: for i in range(len(sentence)): word, tag = sentence[i] word = word.lower() # normalize symbols.add(word) # log this word # Clean up the tag. tag = tag_re.match(tag).group() tag_set.add(tag) sentence[i] = (word, tag) # store cleaned-up tagged token cleaned_sentences += [sentence] return cleaned_sentences, list(tag_set), list(symbols) def demo_pos(): # demonstrates POS tagging using supervised training print() print("HMM POS tagging demo") print() print('Training HMM...') labelled_sequences, tag_set, symbols = load_pos(20000) trainer = HiddenMarkovModelTrainer(tag_set, symbols) hmm = trainer.train_supervised(labelled_sequences[10:], estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins)) print('Testing...') hmm.test(labelled_sequences[:10], verbose=True) def _untag(sentences): unlabeled = [] for sentence in sentences: unlabeled.append([(token[_TEXT], None) for token in sentence]) return unlabeled def demo_pos_bw(test=10, supervised=20, unsupervised=10, verbose=True, max_iterations=5): # demonstrates the Baum-Welch algorithm in POS tagging print() print("Baum-Welch demo for POS tagging") print() print('Training HMM (supervised, %d sentences)...' 
% supervised) sentences, tag_set, symbols = load_pos(test + supervised + unsupervised) symbols = set() for sentence in sentences: for token in sentence: symbols.add(token[_TEXT]) trainer = HiddenMarkovModelTrainer(tag_set, list(symbols)) hmm = trainer.train_supervised(sentences[test:test+supervised], estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins)) hmm.test(sentences[:test], verbose=verbose) print('Training (unsupervised, %d sentences)...' % unsupervised) # it's rather slow - so only use 10 samples by default unlabeled = _untag(sentences[test+supervised:]) hmm = trainer.train_unsupervised(unlabeled, model=hmm, max_iterations=max_iterations) hmm.test(sentences[:test], verbose=verbose) def demo_bw(): # demo Baum Welch by generating some sequences and then performing # unsupervised training on them print() print("Baum-Welch demo for market example") print() model, states, symbols = _market_hmm_example() # generate some random sequences training = [] import random rng = random.Random() rng.seed(0) for i in range(10): item = model.random_sample(rng, 5) training.append([(i[0], None) for i in item]) # train on those examples, starting with the model that generated them trainer = HiddenMarkovModelTrainer(states, symbols) hmm = trainer.train_unsupervised(training, model=model, max_iterations=1000) nltk-3.1/nltk/tag/hunpos.py0000644000076500000240000001166712607224144015502 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Interface to the HunPos POS-tagger # # Copyright (C) 2001-2015 NLTK Project # Author: Peter Ljunglöf # Dávid Márk Nemeskey (modifications) # Attila Zséder (modifications) # URL: # For license information, see LICENSE.TXT """ A module for interfacing with the HunPos open-source POS-tagger. """ import os from subprocess import Popen, PIPE from nltk.internals import find_binary, find_file from nltk.tag.api import TaggerI from nltk import compat _hunpos_url = 'http://code.google.com/p/hunpos/' _hunpos_charset = 'ISO-8859-1' """The default encoding used by hunpos: ISO-8859-1.""" class HunposTagger(TaggerI): """ A class for pos tagging with HunPos. The input is the paths to: - a model trained on training data - (optionally) the path to the hunpos-tag binary - (optionally) the encoding of the training data (default: ISO-8859-1) Example: >>> from nltk.tag import HunposTagger >>> ht = HunposTagger('en_wsj.model') >>> ht.tag('What is the airspeed of an unladen swallow ?'.split()) [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')] >>> ht.close() This class communicates with the hunpos-tag binary via pipes. When the tagger object is no longer needed, the close() method should be called to free system resources. The class supports the context manager interface; if used in a with statement, the close() method is invoked automatically: >>> with HunposTagger('en_wsj.model') as ht: ... ht.tag('What is the airspeed of an unladen swallow ?'.split()) ... [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'VB'), ('?', '.')] """ def __init__(self, path_to_model, path_to_bin=None, encoding=_hunpos_charset, verbose=False): """ Starts the hunpos-tag executable and establishes a connection with it. :param path_to_model: The model file. :param path_to_bin: The hunpos-tag binary. :param encoding: The encoding used by the model. 
Unicode tokens passed to the tag() and tag_sents() methods are converted to this charset when they are sent to hunpos-tag. The default is ISO-8859-1 (Latin-1). This parameter is ignored for str tokens, which are sent as-is. The caller must ensure that tokens are encoded in the right charset. """ self._closed = True hunpos_paths = ['.', '/usr/bin', '/usr/local/bin', '/opt/local/bin', '/Applications/bin', '~/bin', '~/Applications/bin'] hunpos_paths = list(map(os.path.expanduser, hunpos_paths)) self._hunpos_bin = find_binary( 'hunpos-tag', path_to_bin, env_vars=('HUNPOS_TAGGER',), searchpath=hunpos_paths, url=_hunpos_url, verbose=verbose ) self._hunpos_model = find_file( path_to_model, env_vars=('HUNPOS_TAGGER',), verbose=verbose) self._encoding = encoding self._hunpos = Popen([self._hunpos_bin, self._hunpos_model], shell=False, stdin=PIPE, stdout=PIPE, stderr=PIPE) self._closed = False def __del__(self): self.close() def close(self): """Closes the pipe to the hunpos executable.""" if not self._closed: self._hunpos.communicate() self._closed = True def __enter__(self): return self def __exit__(self, exc_type, exc_value, traceback): self.close() def tag(self, tokens): """Tags a single sentence: a list of words. The tokens should not contain any newline characters. """ for token in tokens: assert "\n" not in token, "Tokens should not contain newlines" if isinstance(token, compat.text_type): token = token.encode(self._encoding) self._hunpos.stdin.write(token + b"\n") # We write a final empty line to tell hunpos that the sentence is finished: self._hunpos.stdin.write(b"\n") self._hunpos.stdin.flush() tagged_tokens = [] for token in tokens: tagged = self._hunpos.stdout.readline().strip().split(b"\t") tag = (tagged[1] if len(tagged) > 1 else None) tagged_tokens.append((token, tag)) # We have to read (and dismiss) the final empty line: self._hunpos.stdout.readline() return tagged_tokens # skip doctests if Hunpos tagger is not installed def setup_module(module): from nose import SkipTest try: HunposTagger('en_wsj.model') except LookupError: raise SkipTest("HunposTagger is not available") nltk-3.1/nltk/tag/mapping.py0000644000076500000240000000610512607224144015610 0ustar sbstaff00000000000000# Natural Language Toolkit: Tagset Mapping # # Copyright (C) 2001-2015 NLTK Project # Author: Nathan Schneider # Steven Bird # URL: # For license information, see LICENSE.TXT """ Interface for converting POS tags from various treebanks to the universal tagset of Petrov, Das, & McDonald. The tagset consists of the following 12 coarse tags: VERB - verbs (all tenses and modes) NOUN - nouns (common and proper) PRON - pronouns ADJ - adjectives ADV - adverbs ADP - adpositions (prepositions and postpositions) CONJ - conjunctions DET - determiners NUM - cardinal numbers PRT - particles or other function words X - other: foreign words, typos, abbreviations . 
- punctuation @see: http://arxiv.org/abs/1104.2086 and http://code.google.com/p/universal-pos-tags/ """ from __future__ import print_function, unicode_literals, division from collections import defaultdict from os.path import join from nltk.data import load _UNIVERSAL_DATA = "taggers/universal_tagset" _UNIVERSAL_TAGS = ('VERB','NOUN','PRON','ADJ','ADV','ADP','CONJ','DET','NUM','PRT','X','.') # _MAPPINGS = defaultdict(lambda: defaultdict(dict)) # the mapping between tagset T1 and T2 returns UNK if appied to an unrecognized tag _MAPPINGS = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: 'UNK'))) def _load_universal_map(fileid): contents = load(join(_UNIVERSAL_DATA, fileid+'.map'), format="text") # When mapping to the Universal Tagset, # map unknown inputs to 'X' not 'UNK' _MAPPINGS[fileid]['universal'].default_factory = lambda: 'X' for line in contents.splitlines(): line = line.strip() if line == '': continue fine, coarse = line.split('\t') assert coarse in _UNIVERSAL_TAGS, 'Unexpected coarse tag: {}'.format(coarse) assert fine not in _MAPPINGS[fileid]['universal'], 'Multiple entries for original tag: {}'.format(fine) _MAPPINGS[fileid]['universal'][fine] = coarse def tagset_mapping(source, target): """ Retrieve the mapping dictionary between tagsets. >>> tagset_mapping('ru-rnc', 'universal') == {'!': '.', 'A': 'ADJ', 'C': 'CONJ', 'AD': 'ADV',\ 'NN': 'NOUN', 'VG': 'VERB', 'COMP': 'CONJ', 'NC': 'NUM', 'VP': 'VERB', 'P': 'ADP',\ 'IJ': 'X', 'V': 'VERB', 'Z': 'X', 'VI': 'VERB', 'YES_NO_SENT': 'X', 'PTCL': 'PRT'} True """ if source not in _MAPPINGS or target not in _MAPPINGS[source]: if target == 'universal': _load_universal_map(source) return _MAPPINGS[source][target] def map_tag(source, target, source_tag): """ Maps the tag from the source tagset to the target tagset. >>> map_tag('en-ptb', 'universal', 'VBZ') 'VERB' >>> map_tag('en-ptb', 'universal', 'VBP') 'VERB' >>> map_tag('en-ptb', 'universal', '``') '.' """ # we need a systematic approach to naming if target == 'universal': if source == 'wsj': source = 'en-ptb' if source == 'brown': source = 'en-brown' return tagset_mapping(source, target)[source_tag] nltk-3.1/nltk/tag/perceptron.py0000644000076500000240000002616512607224144016346 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # This module is a port of the Textblob Averaged Perceptron Tagger # Author: Matthew Honnibal , # Long Duong (NLTK port) # URL: # # Copyright 2013 Matthew Honnibal # NLTK modifications Copyright 2015 The NLTK Project # # This module is provided under the terms of the MIT License. from __future__ import absolute_import from __future__ import print_function, division import random from collections import defaultdict import pickle import logging from nltk.tag.api import TaggerI from nltk.data import find from nltk.compat import python_2_unicode_compatible PICKLE = "averaged_perceptron_tagger.pickle" class AveragedPerceptron(object): '''An averaged perceptron, as implemented by Matthew Honnibal. See more implementation details here: http://spacy.io/blog/part-of-speech-POS-tagger-in-python/ ''' def __init__(self): # Each feature gets its own weight vector, so weights is a dict-of-dicts self.weights = {} self.classes = set() # The accumulated values, for the averaging. These will be keyed by # feature/clas tuples self._totals = defaultdict(int) # The last time the feature was changed, for the averaging. 
Also # keyed by feature/clas tuples # (tstamps is short for timestamps) self._tstamps = defaultdict(int) # Number of instances seen self.i = 0 def predict(self, features): '''Dot-product the features and current weights and return the best label.''' scores = defaultdict(float) for feat, value in features.items(): if feat not in self.weights or value == 0: continue weights = self.weights[feat] for label, weight in weights.items(): scores[label] += value * weight # Do a secondary alphabetic sort, for stability return max(self.classes, key=lambda label: (scores[label], label)) def update(self, truth, guess, features): '''Update the feature weights.''' def upd_feat(c, f, w, v): param = (f, c) self._totals[param] += (self.i - self._tstamps[param]) * w self._tstamps[param] = self.i self.weights[f][c] = w + v self.i += 1 if truth == guess: return None for f in features: weights = self.weights.setdefault(f, {}) upd_feat(truth, f, weights.get(truth, 0.0), 1.0) upd_feat(guess, f, weights.get(guess, 0.0), -1.0) def average_weights(self): '''Average weights from all iterations.''' for feat, weights in self.weights.items(): new_feat_weights = {} for clas, weight in weights.items(): param = (feat, clas) total = self._totals[param] total += (self.i - self._tstamps[param]) * weight averaged = round(total / float(self.i), 3) if averaged: new_feat_weights[clas] = averaged self.weights[feat] = new_feat_weights def save(self, path): '''Save the pickled model weights.''' with open(path, 'wb') as fout: return pickle.dump(dict(self.weights), fout) def load(self, path): '''Load the pickled model weights.''' with open(path,'rb') as fin: self.weights = pickle.load(fin) @python_2_unicode_compatible class PerceptronTagger(TaggerI): ''' Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal. See more implementation details here: http://spacy.io/blog/part-of-speech-POS-tagger-in-python/ >>> from nltk.tag.perceptron import PerceptronTagger Train the model >>> tagger = PerceptronTagger(load=False) >>> tagger.train([[('today','NN'),('is','VBZ'),('good','JJ'),('day','NN')], ... [('yes','NNS'),('it','PRP'),('beautiful','JJ')]]) >>> tagger.tag(['today','is','a','beautiful','day']) [('today', 'NN'), ('is', 'PRP'), ('a', 'PRP'), ('beautiful', 'JJ'), ('day', 'NN')] Use the pretrain model (the default constructor) >>> pretrain = PerceptronTagger() >>> pretrain.tag('The quick brown fox jumps over the lazy dog'.split()) [('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')] >>> pretrain.tag("The red cat".split()) [('The', 'DT'), ('red', 'JJ'), ('cat', 'NN')] ''' START = ['-START-', '-START2-'] END = ['-END-', '-END2-'] def __init__(self, load=True): ''' :param load: Load the pickled model upon instantiation. ''' self.model = AveragedPerceptron() self.tagdict = {} self.classes = set() if load: AP_MODEL_LOC = str(find('taggers/averaged_perceptron_tagger/'+PICKLE)) self.load(AP_MODEL_LOC) def tag(self, tokens): ''' Tag tokenized sentences. 
:params tokens: list of word :type tokens: list(str) ''' prev, prev2 = self.START output = [] context = self.START + [self.normalize(w) for w in tokens] + self.END for i, word in enumerate(tokens): tag = self.tagdict.get(word) if not tag: features = self._get_features(i, word, context, prev, prev2) tag = self.model.predict(features) output.append((word, tag)) prev2 = prev prev = tag return output def train(self, sentences, save_loc=None, nr_iter=5): '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter`` controls the number of Perceptron training iterations. :param sentences: A list of (words, tags) tuples. :param save_loc: If not ``None``, saves a pickled model in this location. :param nr_iter: Number of training iterations. ''' self._make_tagdict(sentences) self.model.classes = self.classes for iter_ in range(nr_iter): c = 0 n = 0 for sentence in sentences: words = [word for word,tag in sentence] tags = [tag for word,tag in sentence] prev, prev2 = self.START context = self.START + [self.normalize(w) for w in words] \ + self.END for i, word in enumerate(words): guess = self.tagdict.get(word) if not guess: feats = self._get_features(i, word, context, prev, prev2) guess = self.model.predict(feats) self.model.update(tags[i], guess, feats) prev2 = prev prev = guess c += guess == tags[i] n += 1 random.shuffle(sentences) logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n))) self.model.average_weights() # Pickle as a binary file if save_loc is not None: with open(save_loc, 'wb') as fout: pickle.dump((self.model.weights, self.tagdict, self.classes), fout, -1) def load(self, loc): ''' :param loc: Load a pickled model at location. :type loc: str ''' with open(loc, 'rb') as fin: w_td_c = pickle.load(fin) self.model.weights, self.tagdict, self.classes = w_td_c self.model.classes = self.classes def normalize(self, word): ''' Normalization used in pre-processing. - All words are lower cased - Digits in the range 1800-2100 are represented as !YEAR; - Other digits are represented as !DIGITS :rtype: str ''' if '-' in word and word[0] != '-': return '!HYPHEN' elif word.isdigit() and len(word) == 4: return '!YEAR' elif word[0].isdigit(): return '!DIGITS' else: return word.lower() def _get_features(self, i, word, context, prev, prev2): '''Map tokens into a feature representation, implemented as a {hashable: float} dict. If the features change, a new model must be trained. ''' def add(name, *args): features[' '.join((name,) + tuple(args))] += 1 i += len(self.START) features = defaultdict(int) # It's useful to have a constant feature, which acts sort of like a prior add('bias') add('i suffix', word[-3:]) add('i pref1', word[0]) add('i-1 tag', prev) add('i-2 tag', prev2) add('i tag+i-2 tag', prev, prev2) add('i word', context[i]) add('i-1 tag+i word', prev, context[i]) add('i-1 word', context[i-1]) add('i-1 suffix', context[i-1][-3:]) add('i-2 word', context[i-2]) add('i+1 word', context[i+1]) add('i+1 suffix', context[i+1][-3:]) add('i+2 word', context[i+2]) return features def _make_tagdict(self, sentences): ''' Make a tag dictionary for single-tag words. :param sentences: A list of list of (word, tag) tuples. 
''' counts = defaultdict(lambda: defaultdict(int)) for sentence in sentences: for word, tag in sentence: counts[word][tag] += 1 self.classes.add(tag) freq_thresh = 20 ambiguity_thresh = 0.97 for word, tag_freqs in counts.items(): tag, mode = max(tag_freqs.items(), key=lambda item: item[1]) n = sum(tag_freqs.values()) # Don't add rare words to the tag dictionary # Only add quite unambiguous words if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh: self.tagdict[word] = tag def _pc(n, d): return (float(n) / d) * 100 def _load_data_conll_format(filename): print ('Read from file: ', filename) with open(filename,'rb') as fin: sentences = [] sentence = [] for line in fin.readlines(): line = line.strip() #print line if len(line) ==0: sentences.append(sentence) sentence = [] continue tokens = line.split('\t') word = tokens[1] tag = tokens[4] sentence.append((word,tag)) return sentences def _get_pretrain_model(): # Train and test on English part of ConLL data (WSJ part of Penn Treebank) # Train: section 2-11 # Test : section 23 tagger = PerceptronTagger() training = _load_data_conll_format('english_ptb_train.conll') testing = _load_data_conll_format('english_ptb_test.conll') print ('Size of training and testing (sentence)', len(training), len(testing)) # Train and save the model tagger.train(training, PICKLE) print ('Accuracy : ',tagger.evaluate(testing)) if __name__ == '__main__': #_get_pretrain_model() pass nltk-3.1/nltk/tag/senna.py0000644000076500000240000001377412607224144015273 0ustar sbstaff00000000000000# encoding: utf-8 # Natural Language Toolkit: Senna POS Tagger # # Copyright (C) 2001-2015 NLTK Project # Author: Rami Al-Rfou' # URL: # For license information, see LICENSE.TXT """ Senna POS tagger, NER Tagger, Chunk Tagger The input is: - path to the directory that contains SENNA executables. If the path is incorrect, SennaTagger will automatically search for executable file specified in SENNA environment variable - (optionally) the encoding of the input data (default:utf-8) >>> from nltk.tag import SennaTagger >>> tagger = SennaTagger('/usr/share/senna-v2.0') >>> tagger.tag('What is the airspeed of an unladen swallow ?'.split()) [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'NN'), ('swallow', 'NN'), ('?', '.')] >>> from nltk.tag import SennaChunkTagger >>> chktagger = SennaChunkTagger('/usr/share/senna-v2.0') >>> chktagger.tag('What is the airspeed of an unladen swallow ?'.split()) [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')] >>> from nltk.tag import SennaNERTagger >>> nertagger = SennaNERTagger('/usr/share/senna-v2.0') >>> nertagger.tag('Shakespeare theatre was in London .'.split()) [('Shakespeare', 'B-PER'), ('theatre', 'O'), ('was', 'O'), ('in', 'O'), ('London', 'B-LOC'), ('.', 'O')] >>> nertagger.tag('UN headquarters are in NY , USA .'.split()) [('UN', 'B-ORG'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('NY', 'B-LOC'), (',', 'O'), ('USA', 'B-LOC'), ('.', 'O')] """ from nltk.compat import python_2_unicode_compatible from nltk.classify import Senna @python_2_unicode_compatible class SennaTagger(Senna): def __init__(self, path, encoding='utf-8'): super(SennaTagger, self).__init__(path, ['pos'], encoding) def tag_sents(self, sentences): """ Applies the tag method over a list of sentences. This method will return for each sentence a list of tuples of (word, tag). 
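A batch-tagging sketch (illustrative; the path is the example location
used above and should point at your own SENNA installation)::

    tagger = SennaTagger('/usr/share/senna-v2.0')
    tagger.tag_sents([['The', 'cat', 'sat', '.'],
                      ['Dogs', 'bark', '.']])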
""" tagged_sents = super(SennaTagger, self).tag_sents(sentences) for i in range(len(tagged_sents)): for j in range(len(tagged_sents[i])): annotations = tagged_sents[i][j] tagged_sents[i][j] = (annotations['word'], annotations['pos']) return tagged_sents @python_2_unicode_compatible class SennaChunkTagger(Senna): def __init__(self, path, encoding='utf-8'): super(SennaChunkTagger, self).__init__(path, ['chk'], encoding) def tag_sents(self, sentences): """ Applies the tag method over a list of sentences. This method will return for each sentence a list of tuples of (word, tag). """ tagged_sents = super(SennaChunkTagger, self).tag_sents(sentences) for i in range(len(tagged_sents)): for j in range(len(tagged_sents[i])): annotations = tagged_sents[i][j] tagged_sents[i][j] = (annotations['word'], annotations['chk']) return tagged_sents def bio_to_chunks(self, tagged_sent, chunk_type): """ Extracts the chunks in a BIO chunk-tagged sentence. >>> from nltk.tag import SennaChunkTagger >>> chktagger = SennaChunkTagger('/usr/share/senna-v2.0') >>> sent = 'What is the airspeed of an unladen swallow ?'.split() >>> tagged_sent = chktagger.tag(sent) >>> tagged_sent [('What', 'B-NP'), ('is', 'B-VP'), ('the', 'B-NP'), ('airspeed', 'I-NP'), ('of', 'B-PP'), ('an', 'B-NP'), ('unladen', 'I-NP'), ('swallow', 'I-NP'), ('?', 'O')] >>> list(chktagger.bio_to_chunks(tagged_sent, chunk_type='NP')) [('What', '0'), ('the airspeed', '2-3'), ('an unladen swallow', '5-6-7')] :param tagged_sent: A list of tuples of word and BIO chunk tag. :type tagged_sent: list(tuple) :param tagged_sent: The chunk tag that users want to extract, e.g. 'NP' or 'VP' :type tagged_sent: str :return: An iterable of tuples of chunks that users want to extract and their corresponding indices. :rtype: iter(tuple(str)) """ current_chunk = [] current_chunk_position = [] for idx, word_pos in enumerate(tagged_sent): word, pos = word_pos if '-'+chunk_type in pos: # Append the word to the current_chunk. current_chunk.append((word)) current_chunk_position.append((idx)) else: if current_chunk: # Flush the full chunk when out of an NP. _chunk_str = ' '.join(current_chunk) _chunk_pos_str = '-'.join(map(str, current_chunk_position)) yield _chunk_str, _chunk_pos_str current_chunk = [] current_chunk_position = [] if current_chunk: # Flush the last chunk. yield ' '.join(current_chunk), '-'.join(map(str, current_chunk_position)) @python_2_unicode_compatible class SennaNERTagger(Senna): def __init__(self, path, encoding='utf-8'): super(SennaNERTagger, self).__init__(path, ['ner'], encoding) def tag_sents(self, sentences): """ Applies the tag method over a list of sentences. This method will return for each sentence a list of tuples of (word, tag). 
""" tagged_sents = super(SennaNERTagger, self).tag_sents(sentences) for i in range(len(tagged_sents)): for j in range(len(tagged_sents[i])): annotations = tagged_sents[i][j] tagged_sents[i][j] = (annotations['word'], annotations['ner']) return tagged_sents # skip doctests if Senna is not installed def setup_module(module): from nose import SkipTest try: tagger = Senna('/usr/share/senna-v2.0', ['pos', 'chk', 'ner']) except OSError: raise SkipTest("Senna executable not found") nltk-3.1/nltk/tag/sequential.py0000644000076500000240000006730512607224144016340 0ustar sbstaff00000000000000# Natural Language Toolkit: Sequential Backoff Taggers # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # Tiago Tresoldi (original affix tagger) # URL: # For license information, see LICENSE.TXT """ Classes for tagging sentences sequentially, left to right. The abstract base class SequentialBackoffTagger serves as the base class for all the taggers in this module. Tagging of individual words is performed by the method ``choose_tag()``, which is defined by subclasses of SequentialBackoffTagger. If a tagger is unable to determine a tag for the specified token, then its backoff tagger is consulted instead. Any SequentialBackoffTagger may serve as a backoff tagger for any other SequentialBackoffTagger. """ from __future__ import print_function, unicode_literals import re from nltk.probability import ConditionalFreqDist from nltk.classify import NaiveBayesClassifier from nltk.compat import python_2_unicode_compatible from nltk.tag.api import TaggerI, FeaturesetTaggerI from nltk import jsontags ###################################################################### #{ Abstract Base Classes ###################################################################### class SequentialBackoffTagger(TaggerI): """ An abstract base class for taggers that tags words sequentially, left to right. Tagging of individual words is performed by the ``choose_tag()`` method, which should be defined by subclasses. If a tagger is unable to determine a tag for the specified token, then its backoff tagger is consulted. :ivar _taggers: A list of all the taggers that should be tried to tag a token (i.e., self and its backoff taggers). """ def __init__(self, backoff=None): if backoff is None: self._taggers = [self] else: self._taggers = [self] + backoff._taggers @property def backoff(self): """The backoff tagger for this tagger.""" return self._taggers[1] if len(self._taggers) > 1 else None def tag(self, tokens): # docs inherited from TaggerI tags = [] for i in range(len(tokens)): tags.append(self.tag_one(tokens, i, tags)) return list(zip(tokens, tags)) def tag_one(self, tokens, index, history): """ Determine an appropriate tag for the specified token, and return that tag. If this tagger is unable to determine a tag for the specified token, then its backoff tagger is consulted. :rtype: str :type tokens: list :param tokens: The list of words that are being tagged. :type index: int :param index: The index of the word whose tag should be returned. :type history: list(str) :param history: A list of the tags for all words before *index*. """ tag = None for tagger in self._taggers: tag = tagger.choose_tag(tokens, index, history) if tag is not None: break return tag def choose_tag(self, tokens, index, history): """ Decide which tag should be used for the specified token, and return that tag. If this tagger is unable to determine a tag for the specified token, return None -- do not consult the backoff tagger. 
This method should be overridden by subclasses of SequentialBackoffTagger. :rtype: str :type tokens: list :param tokens: The list of words that are being tagged. :type index: int :param index: The index of the word whose tag should be returned. :type history: list(str) :param history: A list of the tags for all words before *index*. """ raise NotImplementedError() @python_2_unicode_compatible class ContextTagger(SequentialBackoffTagger): """ An abstract base class for sequential backoff taggers that choose a tag for a token based on the value of its "context". Different subclasses are used to define different contexts. A ContextTagger chooses the tag for a token by calculating the token's context, and looking up the corresponding tag in a table. This table can be constructed manually; or it can be automatically constructed based on a training corpus, using the ``_train()`` factory method. :ivar _context_to_tag: Dictionary mapping contexts to tags. """ def __init__(self, context_to_tag, backoff=None): """ :param context_to_tag: A dictionary mapping contexts to tags. :param backoff: The backoff tagger that should be used for this tagger. """ SequentialBackoffTagger.__init__(self, backoff) self._context_to_tag = (context_to_tag if context_to_tag else {}) def context(self, tokens, index, history): """ :return: the context that should be used to look up the tag for the specified token; or None if the specified token should not be handled by this tagger. :rtype: (hashable) """ raise NotImplementedError() def choose_tag(self, tokens, index, history): context = self.context(tokens, index, history) return self._context_to_tag.get(context) def size(self): """ :return: The number of entries in the table used by this tagger to map from contexts to tags. """ return len(self._context_to_tag) def __repr__(self): return '<%s: size=%d>' % (self.__class__.__name__, self.size()) def _train(self, tagged_corpus, cutoff=0, verbose=False): """ Initialize this ContextTagger's ``_context_to_tag`` table based on the given training data. In particular, for each context ``c`` in the training data, set ``_context_to_tag[c]`` to the most frequent tag for that context. However, exclude any contexts that are already tagged perfectly by the backoff tagger(s). The old value of ``self._context_to_tag`` (if any) is discarded. :param tagged_corpus: A tagged corpus. Each item should be a list of (word, tag tuples. :param cutoff: If the most likely tag for a context occurs fewer than cutoff times, then exclude it from the context-to-tag table for the new tagger. """ token_count = hit_count = 0 # A context is considered 'useful' if it's not already tagged # perfectly by the backoff tagger. useful_contexts = set() # Count how many times each tag occurs in each context. fd = ConditionalFreqDist() for sentence in tagged_corpus: tokens, tags = zip(*sentence) for index, (token, tag) in enumerate(sentence): # Record the event. token_count += 1 context = self.context(tokens, index, tags[:index]) if context is None: continue fd[context][tag] += 1 # If the backoff got it wrong, this context is useful: if (self.backoff is None or tag != self.backoff.tag_one(tokens, index, tags[:index])): useful_contexts.add(context) # Build the context_to_tag table -- for each context, figure # out what the most likely tag is. Only include contexts that # we've seen at least `cutoff` times. 
for context in useful_contexts: best_tag = fd[context].max() hits = fd[context][best_tag] if hits > cutoff: self._context_to_tag[context] = best_tag hit_count += hits # Display some stats, if requested. if verbose: size = len(self._context_to_tag) backoff = 100 - (hit_count * 100.0)/ token_count pruning = 100 - (size * 100.0) / len(fd.conditions()) print("[Trained Unigram tagger:", end=' ') print("size=%d, backoff=%.2f%%, pruning=%.2f%%]" % ( size, backoff, pruning)) ###################################################################### #{ Tagger Classes ###################################################################### @python_2_unicode_compatible @jsontags.register_tag class DefaultTagger(SequentialBackoffTagger): """ A tagger that assigns the same tag to every token. >>> from nltk.tag import DefaultTagger >>> default_tagger = DefaultTagger('NN') >>> list(default_tagger.tag('This is a test'.split())) [('This', 'NN'), ('is', 'NN'), ('a', 'NN'), ('test', 'NN')] This tagger is recommended as a backoff tagger, in cases where a more powerful tagger is unable to assign a tag to the word (e.g. because the word was not seen during training). :param tag: The tag to assign to each token :type tag: str """ json_tag = 'nltk.tag.sequential.DefaultTagger' def __init__(self, tag): self._tag = tag SequentialBackoffTagger.__init__(self, None) def encode_json_obj(self): return self._tag @classmethod def decode_json_obj(cls, obj): tag = obj return cls(tag) def choose_tag(self, tokens, index, history): return self._tag # ignore token and history def __repr__(self): return '' % self._tag @jsontags.register_tag class NgramTagger(ContextTagger): """ A tagger that chooses a token's tag based on its word string and on the preceding n word's tags. In particular, a tuple (tags[i-n:i-1], words[i]) is looked up in a table, and the corresponding tag is returned. N-gram taggers are typically trained on a tagged corpus. Train a new NgramTagger using the given training data or the supplied model. In particular, construct a new tagger whose table maps from each context (tag[i-n:i-1], word[i]) to the most frequent tag for that context. But exclude any contexts that are already tagged perfectly by the backoff tagger. :param train: A tagged corpus consisting of a list of tagged sentences, where each sentence is a list of (word, tag) tuples. :param backoff: A backoff tagger, to be used by the new tagger if it encounters an unknown context. :param cutoff: If the most likely tag for a context occurs fewer than *cutoff* times, then exclude it from the context-to-tag table for the new tagger. """ json_tag = 'nltk.tag.sequential.NgramTagger' def __init__(self, n, train=None, model=None, backoff=None, cutoff=0, verbose=False): self._n = n self._check_params(train, model) ContextTagger.__init__(self, model, backoff) if train: self._train(train, cutoff, verbose) def encode_json_obj(self): return self._n, self._context_to_tag, self.backoff @classmethod def decode_json_obj(cls, obj): _n, _context_to_tag, backoff = obj return cls(_n, model=_context_to_tag, backoff=backoff) def context(self, tokens, index, history): tag_context = tuple(history[max(0,index-self._n+1):index]) return tag_context, tokens[index] @jsontags.register_tag class UnigramTagger(NgramTagger): """ Unigram Tagger The UnigramTagger finds the most likely tag for each word in a training corpus, and then uses that information to assign tags to new tokens. 
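A backoff and a cutoff are commonly combined so that rare or unseen words
fall through to a simpler tagger (an illustrative construction;
``train_sents`` is assumed to be tagged training data)::

    tagger = UnigramTagger(train_sents, cutoff=2, backoff=DefaultTagger('NN'))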
>>> from nltk.corpus import brown >>> from nltk.tag import UnigramTagger >>> test_sent = brown.sents(categories='news')[0] >>> unigram_tagger = UnigramTagger(brown.tagged_sents(categories='news')[:500]) >>> for tok, tag in unigram_tagger.tag(test_sent): ... print("(%s, %s), " % (tok, tag)) (The, AT), (Fulton, NP-TL), (County, NN-TL), (Grand, JJ-TL), (Jury, NN-TL), (said, VBD), (Friday, NR), (an, AT), (investigation, NN), (of, IN), (Atlanta's, NP$), (recent, JJ), (primary, NN), (election, NN), (produced, VBD), (``, ``), (no, AT), (evidence, NN), ('', ''), (that, CS), (any, DTI), (irregularities, NNS), (took, VBD), (place, NN), (., .), :param train: The corpus of training data, a list of tagged sentences :type train: list(list(tuple(str, str))) :param model: The tagger model :type model: dict :param backoff: Another tagger which this tagger will consult when it is unable to tag a word :type backoff: TaggerI :param cutoff: The number of instances of training data the tagger must see in order not to use the backoff tagger :type cutoff: int """ json_tag = 'nltk.tag.sequential.UnigramTagger' def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False): NgramTagger.__init__(self, 1, train, model, backoff, cutoff, verbose) def encode_json_obj(self): return self._context_to_tag, self.backoff @classmethod def decode_json_obj(cls, obj): _context_to_tag, backoff = obj return cls(model=_context_to_tag, backoff=backoff) def context(self, tokens, index, history): return tokens[index] @jsontags.register_tag class BigramTagger(NgramTagger): """ A tagger that chooses a token's tag based its word string and on the preceding words' tag. In particular, a tuple consisting of the previous tag and the word is looked up in a table, and the corresponding tag is returned. :param train: The corpus of training data, a list of tagged sentences :type train: list(list(tuple(str, str))) :param model: The tagger model :type model: dict :param backoff: Another tagger which this tagger will consult when it is unable to tag a word :type backoff: TaggerI :param cutoff: The number of instances of training data the tagger must see in order not to use the backoff tagger :type cutoff: int """ json_tag = 'nltk.tag.sequential.BigramTagger' def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False): NgramTagger.__init__(self, 2, train, model, backoff, cutoff, verbose) def encode_json_obj(self): return self._context_to_tag, self.backoff @classmethod def decode_json_obj(cls, obj): _context_to_tag, backoff = obj return cls(model=_context_to_tag, backoff=backoff) @jsontags.register_tag class TrigramTagger(NgramTagger): """ A tagger that chooses a token's tag based its word string and on the preceding two words' tags. In particular, a tuple consisting of the previous two tags and the word is looked up in a table, and the corresponding tag is returned. 
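In practice the n-gram taggers are chained together with ``backoff``, so that
contexts unseen during training fall through to progressively simpler models.
A minimal sketch (the exact tags produced depend on the training slice used):

>>> from nltk.corpus import brown
>>> from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
>>> train_sents = brown.tagged_sents(categories='news')[:500]
>>> t0 = DefaultTagger('NN')
>>> t1 = UnigramTagger(train_sents, backoff=t0)
>>> t2 = BigramTagger(train_sents, backoff=t1)
>>> t3 = TrigramTagger(train_sents, backoff=t2)
>>> t3.tag('The cat sat on the mat .'.split()) # doctest: +SKIP
[('The', 'AT'), ('cat', 'NN'), ...]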
:param train: The corpus of training data, a list of tagged sentences :type train: list(list(tuple(str, str))) :param model: The tagger model :type model: dict :param backoff: Another tagger which this tagger will consult when it is unable to tag a word :type backoff: TaggerI :param cutoff: The number of instances of training data the tagger must see in order not to use the backoff tagger :type cutoff: int """ json_tag = 'nltk.tag.sequential.TrigramTagger' def __init__(self, train=None, model=None, backoff=None, cutoff=0, verbose=False): NgramTagger.__init__(self, 3, train, model, backoff, cutoff, verbose) def encode_json_obj(self): return self._context_to_tag, self.backoff @classmethod def decode_json_obj(cls, obj): _context_to_tag, backoff = obj return cls(model=_context_to_tag, backoff=backoff) @jsontags.register_tag class AffixTagger(ContextTagger): """ A tagger that chooses a token's tag based on a leading or trailing substring of its word string. (It is important to note that these substrings are not necessarily "true" morphological affixes). In particular, a fixed-length substring of the word is looked up in a table, and the corresponding tag is returned. Affix taggers are typically constructed by training them on a tagged corpus. Construct a new affix tagger. :param affix_length: The length of the affixes that should be considered during training and tagging. Use negative numbers for suffixes. :param min_stem_length: Any words whose length is less than min_stem_length+abs(affix_length) will be assigned a tag of None by this tagger. """ json_tag = 'nltk.tag.sequential.AffixTagger' def __init__(self, train=None, model=None, affix_length=-3, min_stem_length=2, backoff=None, cutoff=0, verbose=False): self._check_params(train, model) ContextTagger.__init__(self, model, backoff) self._affix_length = affix_length self._min_word_length = min_stem_length + abs(affix_length) if train: self._train(train, cutoff, verbose) def encode_json_obj(self): return self._affix_length, self._min_word_length, self._context_to_tag, self.backoff @classmethod def decode_json_obj(cls, obj): _affix_length, _min_word_length, _context_to_tag, backoff = obj return cls( affix_length=_affix_length, min_stem_length=_min_word_length - abs(_affix_length), model=_context_to_tag, backoff=backoff ) def context(self, tokens, index, history): token = tokens[index] if len(token) < self._min_word_length: return None elif self._affix_length > 0: return token[:self._affix_length] else: return token[self._affix_length:] @python_2_unicode_compatible @jsontags.register_tag class RegexpTagger(SequentialBackoffTagger): """ Regular Expression Tagger The RegexpTagger assigns tags to tokens by comparing their word strings to a series of regular expressions. The following tagger uses word suffixes to make guesses about the correct Brown Corpus part of speech tag: >>> from nltk.corpus import brown >>> from nltk.tag import RegexpTagger >>> test_sent = brown.sents(categories='news')[0] >>> regexp_tagger = RegexpTagger( ... [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers ... (r'(The|the|A|a|An|an)$', 'AT'), # articles ... (r'.*able$', 'JJ'), # adjectives ... (r'.*ness$', 'NN'), # nouns formed from adjectives ... (r'.*ly$', 'RB'), # adverbs ... (r'.*s$', 'NNS'), # plural nouns ... (r'.*ing$', 'VBG'), # gerunds ... (r'.*ed$', 'VBD'), # past tense verbs ... (r'.*', 'NN') # nouns (default) ... 
]) >>> regexp_tagger >>> regexp_tagger.tag(test_sent) [('The', 'AT'), ('Fulton', 'NN'), ('County', 'NN'), ('Grand', 'NN'), ('Jury', 'NN'), ('said', 'NN'), ('Friday', 'NN'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'NN'), ("Atlanta's", 'NNS'), ('recent', 'NN'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', 'NN'), ('no', 'NN'), ('evidence', 'NN'), ("''", 'NN'), ('that', 'NN'), ('any', 'NN'), ('irregularities', 'NNS'), ('took', 'NN'), ('place', 'NN'), ('.', 'NN')] :type regexps: list(tuple(str, str)) :param regexps: A list of ``(regexp, tag)`` pairs, each of which indicates that a word matching ``regexp`` should be tagged with ``tag``. The pairs will be evalutated in order. If none of the regexps match a word, then the optional backoff tagger is invoked, else it is assigned the tag None. """ json_tag = 'nltk.tag.sequential.RegexpTagger' def __init__(self, regexps, backoff=None): """ """ SequentialBackoffTagger.__init__(self, backoff) labels = ['g'+str(i) for i in range(len(regexps))] tags = [tag for regex, tag in regexps] self._map = dict(zip(labels, tags)) regexps_labels = [(regex, label) for ((regex,tag),label) in zip(regexps,labels)] self._regexs = re.compile('|'.join('(?P<%s>%s)' % (label, regex) for regex,label in regexps_labels)) self._size=len(regexps) def encode_json_obj(self): return self._map, self._regexs.pattern, self._size, self.backoff @classmethod def decode_json_obj(cls, obj): _map, _regexs, _size, backoff = obj self = cls(()) self._map = _map self._regexs = re.compile(_regexs) self._size = _size SequentialBackoffTagger.__init__(self, backoff) return self def choose_tag(self, tokens, index, history): m = self._regexs.match(tokens[index]) if m: return self._map[m.lastgroup] return None def __repr__(self): return '' % self._size @python_2_unicode_compatible class ClassifierBasedTagger(SequentialBackoffTagger, FeaturesetTaggerI): """ A sequential tagger that uses a classifier to choose the tag for each token in a sentence. The featureset input for the classifier is generated by a feature detector function:: feature_detector(tokens, index, history) -> featureset Where tokens is the list of unlabeled tokens in the sentence; index is the index of the token for which feature detection should be performed; and history is list of the tags for all tokens before index. Construct a new classifier-based sequential tagger. :param feature_detector: A function used to generate the featureset input for the classifier:: feature_detector(tokens, index, history) -> featureset :param train: A tagged corpus consisting of a list of tagged sentences, where each sentence is a list of (word, tag) tuples. :param backoff: A backoff tagger, to be used by the new tagger if it encounters an unknown context. :param classifier_builder: A function used to train a new classifier based on the data in *train*. It should take one argument, a list of labeled featuresets (i.e., (featureset, label) tuples). :param classifier: The classifier that should be used by the tagger. This is only useful if you want to manually construct the classifier; normally, you would use *train* instead. :param backoff: A backoff tagger, used if this tagger is unable to determine a tag for a given token. :param cutoff_prob: If specified, then this tagger will fall back on its backoff tagger if the probability of the most likely tag is less than *cutoff_prob*. 
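A minimal sketch of supplying a custom feature detector (the feature names
used here -- ``word``, ``suffix``, ``prevtag`` -- are purely illustrative;
the training steps are marked to be skipped since they take a while):

>>> from nltk.corpus import brown
>>> from nltk.tag.sequential import ClassifierBasedTagger
>>> def fd(tokens, index, history):
...     return {'word': tokens[index].lower(),
...             'suffix': tokens[index][-2:].lower(),
...             'prevtag': history[-1] if history else '<START>'}
>>> tagger = ClassifierBasedTagger(
...     feature_detector=fd,
...     train=brown.tagged_sents(categories='news')[:100]) # doctest: +SKIP
>>> tagger.tag('The cat sat on the mat .'.split()) # doctest: +SKIP
[('The', 'AT'), ...]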
""" def __init__(self, feature_detector=None, train=None, classifier_builder=NaiveBayesClassifier.train, classifier=None, backoff=None, cutoff_prob=None, verbose=False): self._check_params(train, classifier) SequentialBackoffTagger.__init__(self, backoff) if (train and classifier) or (not train and not classifier): raise ValueError('Must specify either training data or ' 'trained classifier.') if feature_detector is not None: self._feature_detector = feature_detector # The feature detector function, used to generate a featureset # or each token: feature_detector(tokens, index, history) -> featureset self._cutoff_prob = cutoff_prob """Cutoff probability for tagging -- if the probability of the most likely tag is less than this, then use backoff.""" self._classifier = classifier """The classifier used to choose a tag for each token.""" if train: self._train(train, classifier_builder, verbose) def choose_tag(self, tokens, index, history): # Use our feature detector to get the featureset. featureset = self.feature_detector(tokens, index, history) # Use the classifier to pick a tag. If a cutoff probability # was specified, then check that the tag's probability is # higher than that cutoff first; otherwise, return None. if self._cutoff_prob is None: return self._classifier.classify(featureset) pdist = self._classifier.prob_classify(featureset) tag = pdist.max() return tag if pdist.prob(tag) >= self._cutoff_prob else None def _train(self, tagged_corpus, classifier_builder, verbose): """ Build a new classifier, based on the given training data *tagged_corpus*. """ classifier_corpus = [] if verbose: print('Constructing training corpus for classifier.') for sentence in tagged_corpus: history = [] untagged_sentence, tags = zip(*sentence) for index in range(len(sentence)): featureset = self.feature_detector(untagged_sentence, index, history) classifier_corpus.append( (featureset, tags[index]) ) history.append(tags[index]) if verbose: print('Training classifier (%d instances)' % len(classifier_corpus)) self._classifier = classifier_builder(classifier_corpus) def __repr__(self): return '' % self._classifier def feature_detector(self, tokens, index, history): """ Return the feature detector that this tagger uses to generate featuresets for its classifier. The feature detector is a function with the signature:: feature_detector(tokens, index, history) -> featureset See ``classifier()`` """ return self._feature_detector(tokens, index, history) def classifier(self): """ Return the classifier that this tagger uses to choose a tag for each word in a sentence. The input for this classifier is generated using this tagger's feature detector. See ``feature_detector()`` """ return self._classifier class ClassifierBasedPOSTagger(ClassifierBasedTagger): """ A classifier based part of speech tagger. 
""" def feature_detector(self, tokens, index, history): word = tokens[index] if index == 0: prevword = prevprevword = None prevtag = prevprevtag = None elif index == 1: prevword = tokens[index-1].lower() prevprevword = None prevtag = history[index-1] prevprevtag = None else: prevword = tokens[index-1].lower() prevprevword = tokens[index-2].lower() prevtag = history[index-1] prevprevtag = history[index-2] if re.match('[0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$', word): shape = 'number' elif re.match('\W+$', word): shape = 'punct' elif re.match('[A-Z][a-z]+$', word): shape = 'upcase' elif re.match('[a-z]+$', word): shape = 'downcase' elif re.match('\w+$', word): shape = 'mixedcase' else: shape = 'other' features = { 'prevtag': prevtag, 'prevprevtag': prevprevtag, 'word': word, 'word.lower': word.lower(), 'suffix3': word.lower()[-3:], 'suffix2': word.lower()[-2:], 'suffix1': word.lower()[-1:], 'prevprevword': prevprevword, 'prevword': prevword, 'prevtag+word': '%s+%s' % (prevtag, word.lower()), 'prevprevtag+word': '%s+%s' % (prevprevtag, word.lower()), 'prevword+word': '%s+%s' % (prevword, word.lower()), 'shape': shape, } return features nltk-3.1/nltk/tag/stanford.py0000644000076500000240000001665412607224144016007 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Interface to the Stanford Part-of-speech and Named-Entity Taggers # # Copyright (C) 2001-2015 NLTK Project # Author: Nitin Madnani # Rami Al-Rfou' # URL: # For license information, see LICENSE.TXT """ A module for interfacing with the Stanford taggers. Tagger models need to be downloaded from http://nlp.stanford.edu/software and the STANFORD_MODELS environment variable set (a colon-separated list of paths). For more details see the documentation for StanfordPOSTagger and StanfordNERTagger. """ import os import tempfile from subprocess import PIPE import warnings from nltk.internals import find_file, find_jar, config_java, java, _java_options from nltk.tag.api import TaggerI from nltk import compat _stanford_url = 'http://nlp.stanford.edu/software' class StanfordTagger(TaggerI): """ An interface to Stanford taggers. Subclasses must define: - ``_cmd`` property: A property that returns the command that will be executed. - ``_SEPARATOR``: Class constant that represents that character that is used to separate the tokens from their tags. - ``_JAR`` file: Class constant that represents the jar file name. """ _SEPARATOR = '' _JAR = '' def __init__(self, model_filename, path_to_jar=None, encoding='utf8', verbose=False, java_options='-mx1000m'): if not self._JAR: warnings.warn('The StanfordTagger class is not meant to be ' 'instantiated directly. 
Did you mean StanfordPOSTagger or StanfordNERTagger?') self._stanford_jar = find_jar( self._JAR, path_to_jar, searchpath=(), url=_stanford_url, verbose=verbose) self._stanford_model = find_file(model_filename, env_vars=('STANFORD_MODELS',), verbose=verbose) self._encoding = encoding self.java_options = java_options @property def _cmd(self): raise NotImplementedError def tag(self, tokens): # This function should return list of tuple rather than list of list return sum(self.tag_sents([tokens]), []) def tag_sents(self, sentences): encoding = self._encoding default_options = ' '.join(_java_options) config_java(options=self.java_options, verbose=False) # Create a temporary input file _input_fh, self._input_file_path = tempfile.mkstemp(text=True) cmd = list(self._cmd) cmd.extend(['-encoding', encoding]) # Write the actual sentences to the temporary input file _input_fh = os.fdopen(_input_fh, 'wb') _input = '\n'.join((' '.join(x) for x in sentences)) if isinstance(_input, compat.text_type) and encoding: _input = _input.encode(encoding) _input_fh.write(_input) _input_fh.close() # Run the tagger and get the output stanpos_output, _stderr = java(cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE) stanpos_output = stanpos_output.decode(encoding) # Delete the temporary file os.unlink(self._input_file_path) # Return java configurations to their default values config_java(options=default_options, verbose=False) return self.parse_output(stanpos_output, sentences) def parse_output(self, text, sentences = None): # Output the tagged sentences tagged_sentences = [] for tagged_sentence in text.strip().split("\n"): sentence = [] for tagged_word in tagged_sentence.strip().split(): word_tags = tagged_word.strip().split(self._SEPARATOR) sentence.append((''.join(word_tags[:-1]), word_tags[-1])) tagged_sentences.append(sentence) return tagged_sentences class StanfordPOSTagger(StanfordTagger): """ A class for pos tagging with Stanford Tagger. The input is the paths to: - a model trained on training data - (optionally) the path to the stanford tagger jar file. If not specified here, then this jar file must be specified in the CLASSPATH envinroment variable. - (optionally) the encoding of the training data (default: UTF-8) Example: >>> from nltk.tag import StanfordPOSTagger >>> st = StanfordPOSTagger('english-bidirectional-distsim.tagger') # doctest: +SKIP >>> st.tag('What is the airspeed of an unladen swallow ?'.split()) # doctest: +SKIP [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'), ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'), ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')] """ _SEPARATOR = '_' _JAR = 'stanford-postagger.jar' def __init__(self, *args, **kwargs): super(StanfordPOSTagger, self).__init__(*args, **kwargs) @property def _cmd(self): return ['edu.stanford.nlp.tagger.maxent.MaxentTagger', '-model', self._stanford_model, '-textFile', self._input_file_path, '-tokenize', 'false','-outputFormatOptions', 'keepEmptySentences'] class StanfordNERTagger(StanfordTagger): """ A class for Named-Entity Tagging with Stanford Tagger. The input is the paths to: - a model trained on training data - (optionally) the path to the stanford tagger jar file. If not specified here, then this jar file must be specified in the CLASSPATH envinroment variable. 
- (optionally) the encoding of the training data (default: UTF-8) Example: >>> from nltk.tag import StanfordNERTagger >>> st = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz') # doctest: +SKIP >>> st.tag('Rami Eid is studying at Stony Brook University in NY'.split()) # doctest: +SKIP [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'), ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'LOCATION')] """ _SEPARATOR = '/' _JAR = 'stanford-ner.jar' _FORMAT = 'slashTags' def __init__(self, *args, **kwargs): super(StanfordNERTagger, self).__init__(*args, **kwargs) @property def _cmd(self): # Adding -tokenizerFactory edu.stanford.nlp.process.WhitespaceTokenizer -tokenizerOptions tokenizeNLs=false for not using stanford Tokenizer return ['edu.stanford.nlp.ie.crf.CRFClassifier', '-loadClassifier', self._stanford_model, '-textFile', self._input_file_path, '-outputFormat', self._FORMAT, '-tokenizerFactory', 'edu.stanford.nlp.process.WhitespaceTokenizer', '-tokenizerOptions','\"tokenizeNLs=false\"'] def parse_output(self, text, sentences): if self._FORMAT == 'slashTags': # Joint together to a big list tagged_sentences = [] for tagged_sentence in text.strip().split("\n"): for tagged_word in tagged_sentence.strip().split(): word_tags = tagged_word.strip().split(self._SEPARATOR) tagged_sentences.append((''.join(word_tags[:-1]), word_tags[-1])) # Separate it according to the input result = [] start = 0 for sent in sentences: result.append(tagged_sentences[start:start + len(sent)]) start += len(sent); return result raise NotImplementedError nltk-3.1/nltk/tag/tnt.py0000755000076500000240000004355212607224144014774 0ustar sbstaff00000000000000# Natural Language Toolkit: TnT Tagger # # Copyright (C) 2001-2015 NLTK Project # Author: Sam Huston # # URL: # For license information, see LICENSE.TXT ''' Implementation of 'TnT - A Statisical Part of Speech Tagger' by Thorsten Brants http://acl.ldc.upenn.edu/A/A00/A00-1031.pdf ''' from __future__ import print_function from math import log from operator import itemgetter from nltk.probability import FreqDist, ConditionalFreqDist from nltk.tag.api import TaggerI class TnT(TaggerI): ''' TnT - Statistical POS tagger IMPORTANT NOTES: * DOES NOT AUTOMATICALLY DEAL WITH UNSEEN WORDS - It is possible to provide an untrained POS tagger to create tags for unknown words, see __init__ function * SHOULD BE USED WITH SENTENCE-DELIMITED INPUT - Due to the nature of this tagger, it works best when trained over sentence delimited input. - However it still produces good results if the training data and testing data are separated on all punctuation eg: [,.?!] - Input for training is expected to be a list of sentences where each sentence is a list of (word, tag) tuples - Input for tag function is a single sentence Input for tagdata function is a list of sentences Output is of a similar form * Function provided to process text that is unsegmented - Please see basic_sent_chop() TnT uses a second order Markov model to produce tags for a sequence of input, specifically: argmax [Proj(P(t_i|t_i-1,t_i-2)P(w_i|t_i))] P(t_T+1 | t_T) IE: the maximum projection of a set of probabilities The set of possible tags for a given word is derived from the training data. It is the set of all tags that exact word has been assigned. 
To speed up and get more precision, we can use log addition to instead multiplication, specifically: argmax [Sigma(log(P(t_i|t_i-1,t_i-2))+log(P(w_i|t_i)))] + log(P(t_T+1|t_T)) The probability of a tag for a given word is the linear interpolation of 3 markov models; a zero-order, first-order, and a second order model. P(t_i| t_i-1, t_i-2) = l1*P(t_i) + l2*P(t_i| t_i-1) + l3*P(t_i| t_i-1, t_i-2) A beam search is used to limit the memory usage of the algorithm. The degree of the beam can be changed using N in the initialization. N represents the maximum number of possible solutions to maintain while tagging. It is possible to differentiate the tags which are assigned to capitalized words. However this does not result in a significant gain in the accuracy of the results. ''' def __init__(self, unk=None, Trained=False, N=1000, C=False): ''' Construct a TnT statistical tagger. Tagger must be trained before being used to tag input. :param unk: instance of a POS tagger, conforms to TaggerI :type unk:(TaggerI) :param Trained: Indication that the POS tagger is trained or not :type Trained: boolean :param N: Beam search degree (see above) :type N:(int) :param C: Capitalization flag :type C: boolean Initializer, creates frequency distributions to be used for tagging _lx values represent the portion of the tri/bi/uni taggers to be used to calculate the probability N value is the number of possible solutions to maintain while tagging. A good value for this is 1000 C is a boolean value which specifies to use or not use the Capitalization of the word as additional information for tagging. NOTE: using capitalization may not increase the accuracy of the tagger ''' self._uni = FreqDist() self._bi = ConditionalFreqDist() self._tri = ConditionalFreqDist() self._wd = ConditionalFreqDist() self._eos = ConditionalFreqDist() self._l1 = 0.0 self._l2 = 0.0 self._l3 = 0.0 self._N = N self._C = C self._T = Trained self._unk = unk # statistical tools (ignore or delete me) self.unknown = 0 self.known = 0 def train(self, data): ''' Uses a set of tagged data to train the tagger. If an unknown word tagger is specified, it is trained on the same data. 
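A minimal sketch (here a DefaultTagger is used for unknown words; passing
``Trained=True`` tells TnT not to call ``train()`` on it, since a
DefaultTagger needs no training):

>>> from nltk.corpus import brown
>>> from nltk.tag import DefaultTagger
>>> from nltk.tag.tnt import TnT
>>> tagger = TnT(unk=DefaultTagger('NN'), Trained=True)
>>> tagger.train(brown.tagged_sents(categories='news')[:500]) # doctest: +SKIP
>>> tagger.tag('The cat sat on the mat .'.split()) # doctest: +SKIP
[('The', 'AT'), ...]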
:param data: List of lists of (word, tag) tuples :type data: tuple(str) ''' # Ensure that local C flag is initialized before use C = False if self._unk is not None and self._T == False: self._unk.train(data) for sent in data: history = [('BOS',False), ('BOS',False)] for w, t in sent: # if capitalization is requested, # and the word begins with a capital # set local flag C to True if self._C and w[0].isupper(): C=True self._wd[w][t] += 1 self._uni[(t,C)] += 1 self._bi[history[1]][(t,C)] += 1 self._tri[tuple(history)][(t,C)] += 1 history.append((t,C)) history.pop(0) # set local flag C to false for the next word C = False self._eos[t]['EOS'] += 1 # compute lambda values from the trained frequency distributions self._compute_lambda() #(debugging -- ignore or delete me) #print "lambdas" #print i, self._l1, i, self._l2, i, self._l3 def _compute_lambda(self): ''' creates lambda values based upon training data NOTE: no need to explicitly reference C, it is contained within the tag variable :: tag == (tag,C) for each tag trigram (t1, t2, t3) depending on the maximum value of - f(t1,t2,t3)-1 / f(t1,t2)-1 - f(t2,t3)-1 / f(t2)-1 - f(t3)-1 / N-1 increment l3,l2, or l1 by f(t1,t2,t3) ISSUES -- Resolutions: if 2 values are equal, increment both lambda values by (f(t1,t2,t3) / 2) ''' # temporary lambda variables tl1 = 0.0 tl2 = 0.0 tl3 = 0.0 # for each t1,t2 in system for history in self._tri.conditions(): (h1, h2) = history # for each t3 given t1,t2 in system # (NOTE: tag actually represents (tag,C)) # However no effect within this function for tag in self._tri[history].keys(): # if there has only been 1 occurrence of this tag in the data # then ignore this trigram. if self._uni[tag] == 1: continue # safe_div provides a safe floating point division # it returns -1 if the denominator is 0 c3 = self._safe_div((self._tri[history][tag]-1), (self._tri[history].N()-1)) c2 = self._safe_div((self._bi[h2][tag]-1), (self._bi[h2].N()-1)) c1 = self._safe_div((self._uni[tag]-1), (self._uni.N()-1)) # if c1 is the maximum value: if (c1 > c3) and (c1 > c2): tl1 += self._tri[history][tag] # if c2 is the maximum value elif (c2 > c3) and (c2 > c1): tl2 += self._tri[history][tag] # if c3 is the maximum value elif (c3 > c2) and (c3 > c1): tl3 += self._tri[history][tag] # if c3, and c2 are equal and larger than c1 elif (c3 == c2) and (c3 > c1): tl2 += float(self._tri[history][tag]) /2.0 tl3 += float(self._tri[history][tag]) /2.0 # if c1, and c2 are equal and larger than c3 # this might be a dumb thing to do....(not sure yet) elif (c2 == c1) and (c1 > c3): tl1 += float(self._tri[history][tag]) /2.0 tl2 += float(self._tri[history][tag]) /2.0 # otherwise there might be a problem # eg: all values = 0 else: #print "Problem", c1, c2 ,c3 pass # Lambda normalisation: # ensures that l1+l2+l3 = 1 self._l1 = tl1 / (tl1+tl2+tl3) self._l2 = tl2 / (tl1+tl2+tl3) self._l3 = tl3 / (tl1+tl2+tl3) def _safe_div(self, v1, v2): ''' Safe floating point division function, does not allow division by 0 returns -1 if the denominator is 0 ''' if v2 == 0: return -1 else: return float(v1) / float(v2) def tagdata(self, data): ''' Tags each sentence in a list of sentences :param data:list of list of words :type data: [[string,],] :return: list of list of (word, tag) tuples Invokes tag(sent) function for each sentence compiles the results into a list of tagged sentences each tagged sentence is a list of (word, tag) tuples ''' res = [] for sent in data: res1 = self.tag(sent) res.append(res1) return res def tag(self, data): ''' Tags a single sentence :param 
data: list of words :type data: [string,] :return: [(word, tag),] Calls recursive function '_tagword' to produce a list of tags Associates the sequence of returned tags with the correct words in the input sequence returns a list of (word, tag) tuples ''' current_state = [(['BOS', 'BOS'], 0.0)] sent = list(data) tags = self._tagword(sent, current_state) res = [] for i in range(len(sent)): # unpack and discard the C flags (t,C) = tags[i+2] res.append((sent[i], t)) return res def _tagword(self, sent, current_states): ''' :param sent : List of words remaining in the sentence :type sent : [word,] :param current_states : List of possible tag combinations for the sentence so far, and the log probability associated with each tag combination :type current_states : [([tag, ], logprob), ] Tags the first word in the sentence and recursively tags the reminder of sentence Uses formula specified above to calculate the probability of a particular tag ''' # if this word marks the end of the sentance, # return the most probable tag if sent == []: (h, logp) = current_states[0] return h # otherwise there are more words to be tagged word = sent[0] sent = sent[1:] new_states = [] # if the Capitalisation is requested, # initalise the flag for this word C = False if self._C and word[0].isupper(): C=True # if word is known # compute the set of possible tags # and their associated log probabilities if word in self._wd.conditions(): self.known += 1 for (history, curr_sent_logprob) in current_states: logprobs = [] for t in self._wd[word].keys(): p_uni = self._uni.freq((t,C)) p_bi = self._bi[history[-1]].freq((t,C)) p_tri = self._tri[tuple(history[-2:])].freq((t,C)) p_wd = float(self._wd[word][t])/float(self._uni[(t,C)]) p = self._l1 *p_uni + self._l2 *p_bi + self._l3 *p_tri p2 = log(p, 2) + log(p_wd, 2) logprobs.append(((t,C), p2)) # compute the result of appending each tag to this history for (tag, logprob) in logprobs: new_states.append((history + [tag], curr_sent_logprob + logprob)) # otherwise a new word, set of possible tags is unknown else: self.unknown += 1 # since a set of possible tags, # and the probability of each specific tag # can not be returned from most classifiers: # specify that any unknown words are tagged with certainty p = 1 # if no unknown word tagger has been specified # then use the tag 'Unk' if self._unk is None: tag = ('Unk',C) # otherwise apply the unknown word tagger else : [(_w, t)] = list(self._unk.tag([word])) tag = (t,C) for (history, logprob) in current_states: history.append(tag) new_states = current_states # now have computed a set of possible new_states # sort states by log prob # set is now ordered greatest to least log probability new_states.sort(reverse=True, key=itemgetter(1)) # del everything after N (threshold) # this is the beam search cut if len(new_states) > self._N: new_states = new_states[:self._N] # compute the tags for the rest of the sentence # return the best list of tags for the sentence return self._tagword(sent, new_states) ######################################## # helper function -- basic sentence tokenizer ######################################## def basic_sent_chop(data, raw=True): ''' Basic method for tokenizing input into sentences for this tagger: :param data: list of tokens (words or (word, tag) tuples) :type data: str or tuple(str, str) :param raw: boolean flag marking the input data as a list of words or a list of tagged words :type raw: bool :return: list of sentences sentences are a list of tokens tokens are the same as the input Function takes a list 
of tokens and separates the tokens into lists where each list represents a sentence fragment This function can separate both tagged and raw sequences into basic sentences. Sentence markers are the set of [,.!?] This is a simple method which enhances the performance of the TnT tagger. Better sentence tokenization will further enhance the results. ''' new_data = [] curr_sent = [] sent_mark = [',','.','?','!'] if raw: for word in data: if word in sent_mark: curr_sent.append(word) new_data.append(curr_sent) curr_sent = [] else: curr_sent.append(word) else: for (word,tag) in data: if word in sent_mark: curr_sent.append((word,tag)) new_data.append(curr_sent) curr_sent = [] else: curr_sent.append((word,tag)) return new_data def demo(): from nltk.corpus import brown sents = list(brown.tagged_sents()) test = list(brown.sents()) # create and train the tagger tagger = TnT() tagger.train(sents[200:1000]) # tag some data tagged_data = tagger.tagdata(test[100:120]) # print results for j in range(len(tagged_data)): s = tagged_data[j] t = sents[j+100] for i in range(len(s)): print(s[i],'--', t[i]) print() def demo2(): from nltk.corpus import treebank d = list(treebank.tagged_sents()) t = TnT(N=1000, C=False) s = TnT(N=1000, C=True) t.train(d[(11)*100:]) s.train(d[(11)*100:]) for i in range(10): tacc = t.evaluate(d[i*100:((i+1)*100)]) tp_un = float(t.unknown) / float(t.known +t.unknown) tp_kn = float(t.known) / float(t.known + t.unknown) t.unknown = 0 t.known = 0 print('Capitalization off:') print('Accuracy:', tacc) print('Percentage known:', tp_kn) print('Percentage unknown:', tp_un) print('Accuracy over known words:', (tacc / tp_kn)) sacc = s.evaluate(d[i*100:((i+1)*100)]) sp_un = float(s.unknown) / float(s.known +s.unknown) sp_kn = float(s.known) / float(s.known + s.unknown) s.unknown = 0 s.known = 0 print('Capitalization on:') print('Accuracy:', sacc) print('Percentage known:', sp_kn) print('Percentage unknown:', sp_un) print('Accuracy over known words:', (sacc / sp_kn)) def demo3(): from nltk.corpus import treebank, brown d = list(treebank.tagged_sents()) e = list(brown.tagged_sents()) d = d[:1000] e = e[:1000] d10 = int(len(d)*0.1) e10 = int(len(e)*0.1) tknacc = 0 sknacc = 0 tallacc = 0 sallacc = 0 tknown = 0 sknown = 0 for i in range(10): t = TnT(N=1000, C=False) s = TnT(N=1000, C=False) dtest = d[(i*d10):((i+1)*d10)] etest = e[(i*e10):((i+1)*e10)] dtrain = d[:(i*d10)] + d[((i+1)*d10):] etrain = e[:(i*e10)] + e[((i+1)*e10):] t.train(dtrain) s.train(etrain) tacc = t.evaluate(dtest) tp_un = float(t.unknown) / float(t.known +t.unknown) tp_kn = float(t.known) / float(t.known + t.unknown) tknown += tp_kn t.unknown = 0 t.known = 0 sacc = s.evaluate(etest) sp_un = float(s.unknown) / float(s.known + s.unknown) sp_kn = float(s.known) / float(s.known + s.unknown) sknown += sp_kn s.unknown = 0 s.known = 0 tknacc += (tacc / tp_kn) sknacc += (sacc / tp_kn) tallacc += tacc sallacc += sacc #print i+1, (tacc / tp_kn), i+1, (sacc / tp_kn), i+1, tacc, i+1, sacc print("brown: acc over words known:", 10 * tknacc) print(" : overall accuracy:", 10 * tallacc) print(" : words known:", 10 * tknown) print("treebank: acc over words known:", 10 * sknacc) print(" : overall accuracy:", 10 * sallacc) print(" : words known:", 10 * sknown) nltk-3.1/nltk/tag/util.py0000644000076500000240000000435112607224144015133 0ustar sbstaff00000000000000# Natural Language Toolkit: Tagger Utilities # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT def str2tuple(s, 
sep='/'): """ Given the string representation of a tagged token, return the corresponding tuple representation. The rightmost occurrence of *sep* in *s* will be used to divide *s* into a word string and a tag string. If *sep* does not occur in *s*, return (s, None). >>> from nltk.tag.util import str2tuple >>> str2tuple('fly/NN') ('fly', 'NN') :type s: str :param s: The string representation of a tagged token. :type sep: str :param sep: The separator string used to separate word strings from tags. """ loc = s.rfind(sep) if loc >= 0: return (s[:loc], s[loc+len(sep):].upper()) else: return (s, None) def tuple2str(tagged_token, sep='/'): """ Given the tuple representation of a tagged token, return the corresponding string representation. This representation is formed by concatenating the token's word string, followed by the separator, followed by the token's tag. (If the tag is None, then just return the bare word string.) >>> from nltk.tag.util import tuple2str >>> tagged_token = ('fly', 'NN') >>> tuple2str(tagged_token) 'fly/NN' :type tagged_token: tuple(str, str) :param tagged_token: The tuple representation of a tagged token. :type sep: str :param sep: The separator string used to separate word strings from tags. """ word, tag = tagged_token if tag is None: return word else: assert sep not in tag, 'tag may not contain sep!' return '%s%s%s' % (word, sep, tag) def untag(tagged_sentence): """ Given a tagged sentence, return an untagged version of that sentence. I.e., return a list containing the first element of each tuple in *tagged_sentence*. >>> from nltk.tag.util import untag >>> untag([('John', 'NNP'), ('saw', 'VBD'), ('Mary', 'NNP')]) ['John', 'saw', 'Mary'] """ return [w for (w, t) in tagged_sentence] nltk-3.1/nltk/tbl/0000755000076500000240000000000012610001541013573 5ustar sbstaff00000000000000nltk-3.1/nltk/tbl/__init__.py0000644000076500000240000000136412607224144015724 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2015 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT """ Transformation Based Learning A general purpose package for Transformation Based Learning, currently used by nltk.tag.BrillTagger. """ from nltk.tbl.template import Template #API: Template(...), Template.expand(...) from nltk.tbl.feature import Feature #API: Feature(...), Feature.expand(...) from nltk.tbl.rule import Rule #API: Rule.format(...), Rule.templatetid from nltk.tbl.erroranalysis import error_list nltk-3.1/nltk/tbl/api.py0000644000076500000240000000000112574600335014724 0ustar sbstaff00000000000000 nltk-3.1/nltk/tbl/demo.py0000644000076500000240000003457312607224144015121 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2015 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT from __future__ import print_function, absolute_import, division import os import pickle import random import time from nltk.corpus import treebank from nltk.tbl import error_list, Template from nltk.tag.brill import Word, Pos from nltk.tag import BrillTaggerTrainer, RegexpTagger, UnigramTagger def demo(): """ Run a demo with defaults. 
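For instance (marked to be skipped, since it loads tagged data from the
treebank sample and trains a tagger, which takes a while):

>>> from nltk.tbl.demo import demo
>>> demo() # doctest: +SKIP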
See source comments for details, or docstrings of any of the more specific demo_* functions. """ postag() def demo_repr_rule_format(): """ Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose")) """ postag(ruleformat="repr") def demo_str_rule_format(): """ Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose")) """ postag(ruleformat="str") def demo_verbose_rule_format(): """ Exemplify Rule.format("verbose") """ postag(ruleformat="verbose") def demo_multiposition_feature(): """ The feature/s of a template takes a list of positions relative to the current word where the feature should be looked for, conceptually joined by logical OR. For instance, Pos([-1, 1]), given a value V, will hold whenever V is found one step to the left and/or one step to the right. For contiguous ranges, a 2-arg form giving inclusive end points can also be used: Pos(-3, -1) is the same as the arg below. """ postag(templates=[Template(Pos([-3,-2,-1]))]) def demo_multifeature_template(): """ Templates can have more than a single feature. """ postag(templates=[Template(Word([0]), Pos([-2,-1]))]) def demo_template_statistics(): """ Show aggregate statistics per template. Little used templates are candidates for deletion, much used templates may possibly be refined. Deleting unused templates is mostly about saving time and/or space: training is basically O(T) in the number of templates T (also in terms of memory usage, which often will be the limiting factor). """ postag(incremental_stats=True, template_stats=True) def demo_generated_templates(): """ Template.expand and Feature.expand are class methods facilitating generating large amounts of templates. See their documentation for details. Note: training with 500 templates can easily fill all available even on relatively small corpora """ wordtpls = Word.expand([-1,0,1], [1,2], excludezero=False) tagtpls = Pos.expand([-2,-1,0,1], [1,2], excludezero=True) templates = list(Template.expand([wordtpls, tagtpls], combinations=(1,3))) print("Generated {0} templates for transformation-based learning".format(len(templates))) postag(templates=templates, incremental_stats=True, template_stats=True) def demo_learning_curve(): """ Plot a learning curve -- the contribution on tagging accuracy of the individual rules. Note: requires matplotlib """ postag(incremental_stats=True, separate_baseline_data=True, learning_curve_output="learningcurve.png") def demo_error_analysis(): """ Writes a file with context for each erroneous word after tagging testing data """ postag(error_output="errors.txt") def demo_serialize_tagger(): """ Serializes the learned tagger to a file in pickle format; reloads it and validates the process. """ postag(serialize_output="tagger.pcl") def demo_high_accuracy_rules(): """ Discard rules with low accuracy. This may hurt performance a bit, but will often produce rules which are more interesting read to a human. 
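Like the other demo_* wrappers above, this simply calls ``postag`` with
particular keyword arguments; any combination can also be passed directly.
A sketch (marked to be skipped, since training takes a while; the single
template chosen here is only an example):

>>> from nltk.tbl.demo import postag
>>> from nltk.tbl import Template
>>> from nltk.tag.brill import Word, Pos
>>> postag(templates=[Template(Word([0]), Pos([-1]))],
...        num_sents=500, max_rules=50) # doctest: +SKIP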
""" postag(num_sents=3000, min_acc=0.96, min_score=10) def postag( templates=None, tagged_data=None, num_sents=1000, max_rules=300, min_score=3, min_acc=None, train=0.8, trace=3, randomize=False, ruleformat="str", incremental_stats=False, template_stats=False, error_output=None, serialize_output=None, learning_curve_output=None, learning_curve_take=300, baseline_backoff_tagger=None, separate_baseline_data=False, cache_baseline_tagger=None): """ Brill Tagger Demonstration :param templates: how many sentences of training and testing data to use :type templates: list of Template :param tagged_data: maximum number of rule instances to create :type tagged_data: C{int} :param num_sents: how many sentences of training and testing data to use :type num_sents: C{int} :param max_rules: maximum number of rule instances to create :type max_rules: C{int} :param min_score: the minimum score for a rule in order for it to be considered :type min_score: C{int} :param min_acc: the minimum score for a rule in order for it to be considered :type min_acc: C{float} :param train: the fraction of the the corpus to be used for training (1=all) :type train: C{float} :param trace: the level of diagnostic tracing output to produce (0-4) :type trace: C{int} :param randomize: whether the training data should be a random subset of the corpus :type randomize: C{bool} :param ruleformat: rule output format, one of "str", "repr", "verbose" :type ruleformat: C{str} :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow) :type incremental_stats: C{bool} :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing :type template_stats: C{bool} :param error_output: the file where errors will be saved :type error_output: C{string} :param serialize_output: the file where the learned tbl tagger will be saved :type serialize_output: C{string} :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available) :type learning_curve_output: C{string} :param learning_curve_take: how many rules plotted :type learning_curve_take: C{int} :param baseline_backoff_tagger: the file where rules will be saved :type baseline_backoff_tagger: tagger :param separate_baseline_data: use a fraction of the training data exclusively for training baseline :type separate_baseline_data: C{bool} :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get deterministic output from the baseline unigram tagger between python versions) :type cache_baseline_tagger: C{string} Note on separate_baseline_data: if True, reuse training data both for baseline and rule learner. This is fast and fine for a demo, but is likely to generalize worse on unseen data. Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high). """ # defaults baseline_backoff_tagger = baseline_backoff_tagger or REGEXP_TAGGER if templates is None: from nltk.tag.brill import describe_template_sets, brill24 # some pre-built template sets taken from typical systems or publications are # available. 
Print a list with describe_template_sets() # for instance: templates = brill24() (training_data, baseline_data, gold_data, testing_data) = \ _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data) # creating (or reloading from cache) a baseline tagger (unigram tagger) # this is just a mechanism for getting deterministic output from the baseline between # python versions if cache_baseline_tagger: if not os.path.exists(cache_baseline_tagger): baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger) with open(cache_baseline_tagger, 'w') as print_rules: pickle.dump(baseline_tagger, print_rules) print("Trained baseline tagger, pickled it to {0}".format(cache_baseline_tagger)) with open(cache_baseline_tagger, "r") as print_rules: baseline_tagger= pickle.load(print_rules) print("Reloaded pickled tagger from {0}".format(cache_baseline_tagger)) else: baseline_tagger = UnigramTagger(baseline_data, backoff=baseline_backoff_tagger) print("Trained baseline tagger") if gold_data: print(" Accuracy on test set: {0:0.4f}".format(baseline_tagger.evaluate(gold_data))) # creating a Brill tagger tbrill = time.time() trainer = BrillTaggerTrainer(baseline_tagger, templates, trace, ruleformat=ruleformat) print("Training tbl tagger...") brill_tagger = trainer.train(training_data, max_rules, min_score, min_acc) print("Trained tbl tagger in {0:0.2f} seconds".format(time.time() - tbrill)) if gold_data: print(" Accuracy on test set: %.4f" % brill_tagger.evaluate(gold_data)) # printing the learned rules, if learned silently if trace == 1: print("\nLearned rules: ") for (ruleno, rule) in enumerate(brill_tagger.rules(),1): print("{0:4d} {1:s}".format(ruleno, rule.format(ruleformat))) # printing template statistics (optionally including comparison with the training data) # note: if not separate_baseline_data, then baseline accuracy will be artificially high if incremental_stats: print("Incrementally tagging the test data, collecting individual rule statistics") (taggedtest, teststats) = brill_tagger.batch_tag_incremental(testing_data, gold_data) print(" Rule statistics collected") if not separate_baseline_data: print("WARNING: train_stats asked for separate_baseline_data=True; the baseline " "will be artificially high") trainstats = brill_tagger.train_stats() if template_stats: brill_tagger.print_template_statistics(teststats) if learning_curve_output: _demo_plot(learning_curve_output, teststats, trainstats, take=learning_curve_take) print("Wrote plot of learning curve to {0}".format(learning_curve_output)) else: print("Tagging the test data") taggedtest = brill_tagger.tag_sents(testing_data) if template_stats: brill_tagger.print_template_statistics() # writing error analysis to file if error_output is not None: with open(error_output, 'w') as f: f.write('Errors for Brill Tagger %r\n\n' % serialize_output) f.write(u'\n'.join(error_list(gold_data, taggedtest)).encode('utf-8') + '\n') print("Wrote tagger errors including context to {0}".format(error_output)) # serializing the tagger to a pickle file and reloading (just to see it works) if serialize_output is not None: taggedtest = brill_tagger.tag_sents(testing_data) with open(serialize_output, 'w') as print_rules: pickle.dump(brill_tagger, print_rules) print("Wrote pickled tagger to {0}".format(serialize_output)) with open(serialize_output, "r") as print_rules: brill_tagger_reloaded = pickle.load(print_rules) print("Reloaded pickled tagger from {0}".format(serialize_output)) taggedtest_reloaded = 
brill_tagger.tag_sents(testing_data) if taggedtest == taggedtest_reloaded: print("Reloaded tagger tried on test set, results identical") else: print("PROBLEM: Reloaded tagger gave different results on test set") def _demo_prepare_data(tagged_data, train, num_sents, randomize, separate_baseline_data): # train is the proportion of data used in training; the rest is reserved # for testing. if tagged_data is None: print("Loading tagged data from treebank... ") tagged_data = treebank.tagged_sents() if num_sents is None or len(tagged_data) <= num_sents: num_sents = len(tagged_data) if randomize: random.seed(len(tagged_data)) random.shuffle(tagged_data) cutoff = int(num_sents * train) training_data = tagged_data[:cutoff] gold_data = tagged_data[cutoff:num_sents] testing_data = [[t[0] for t in sent] for sent in gold_data] if not separate_baseline_data: baseline_data = training_data else: bl_cutoff = len(training_data) // 3 (baseline_data, training_data) = (training_data[:bl_cutoff], training_data[bl_cutoff:]) (trainseqs, traintokens) = corpus_size(training_data) (testseqs, testtokens) = corpus_size(testing_data) (bltrainseqs, bltraintokens) = corpus_size(baseline_data) print("Read testing data ({0:d} sents/{1:d} wds)".format(testseqs, testtokens)) print("Read training data ({0:d} sents/{1:d} wds)".format(trainseqs, traintokens)) print("Read baseline data ({0:d} sents/{1:d} wds) {2:s}".format( bltrainseqs, bltraintokens, "" if separate_baseline_data else "[reused the training set]")) return (training_data, baseline_data, gold_data, testing_data) def _demo_plot(learning_curve_output, teststats, trainstats=None, take=None): testcurve = [teststats['initialerrors']] for rulescore in teststats['rulescores']: testcurve.append(testcurve[-1] - rulescore) testcurve = [1 - x/teststats['tokencount'] for x in testcurve[:take]] traincurve = [trainstats['initialerrors']] for rulescore in trainstats['rulescores']: traincurve.append(traincurve[-1] - rulescore) traincurve = [1 - x/trainstats['tokencount'] for x in traincurve[:take]] import matplotlib.pyplot as plt r = list(range(len(testcurve))) plt.plot(r, testcurve, r, traincurve) plt.axis([None, None, None, 1.0]) plt.savefig(learning_curve_output) NN_CD_TAGGER = RegexpTagger( [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), (r'.*', 'NN')]) REGEXP_TAGGER = RegexpTagger( [(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers (r'(The|the|A|a|An|an)$', 'AT'), # articles (r'.*able$', 'JJ'), # adjectives (r'.*ness$', 'NN'), # nouns formed from adjectives (r'.*ly$', 'RB'), # adverbs (r'.*s$', 'NNS'), # plural nouns (r'.*ing$', 'VBG'), # gerunds (r'.*ed$', 'VBD'), # past tense verbs (r'.*', 'NN') # nouns (default) ]) def corpus_size(seqs): return (len(seqs), sum(len(x) for x in seqs)) if __name__ == '__main__': demo_learning_curve() nltk-3.1/nltk/tbl/erroranalysis.py0000644000076500000240000000271312607224144017061 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2015 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT from __future__ import print_function # returns a list of errors in string format def error_list(train_sents, test_sents): """ Returns a list of human-readable strings indicating the errors in the given tagging of the corpus. 
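A sketch of typical use (``gold_sents`` and ``tagged_sents`` are assumed to be
parallel lists of tagged sentences -- the reference tagging and some tagger's
output for the same text):

>>> from nltk.tbl import error_list
>>> for line in error_list(gold_sents, tagged_sents)[:10]: # doctest: +SKIP
...     print(line)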
:param train_sents: The correct tagging of the corpus :type train_sents: list(tuple) :param test_sents: The tagged corpus :type test_sents: list(tuple) """ hdr = (('%25s | %s | %s\n' + '-'*26+'+'+'-'*24+'+'+'-'*26) % ('left context', 'word/test->gold'.center(22), 'right context')) errors = [hdr] for (train_sent, test_sent) in zip(train_sents, test_sents): for wordnum, (word, train_pos) in enumerate(train_sent): test_pos = test_sent[wordnum][1] if train_pos != test_pos: left = ' '.join('%s/%s' % w for w in train_sent[:wordnum]) right = ' '.join('%s/%s' % w for w in train_sent[wordnum+1:]) mid = '%s/%s->%s' % (word, test_pos, train_pos) errors.append('%25s | %s | %s' % (left[-25:], mid.center(22), right[:25])) return errors nltk-3.1/nltk/tbl/feature.py0000644000076500000240000002251012607224144015614 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2015 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT from __future__ import division, print_function, unicode_literals class Feature(object): """ An abstract base class for Features. A Feature is a combination of a specific property-computing method and a list of relative positions to apply that method to. The property-computing method, M{extract_property(tokens, index)}, must be implemented by every subclass. It extracts or computes a specific property for the token at the current index. Typical extract_property() methods return features such as the token text or tag; but more involved methods may consider the entire sequence M{tokens} and for instance compute the length of the sentence the token belongs to. In addition, the subclass may have a PROPERTY_NAME, which is how it will be printed (in Rules and Templates, etc). If not given, defaults to the classname. """ # !!FOR_FUTURE: when targeting python3 only, consider @abc.abstractmethod # and metaclass=abc.ABCMeta rather than NotImplementedError # http://julien.danjou.info/blog/2013/guide-python-static-class-abstract-methods json_tag = 'nltk.tbl.Feature' PROPERTY_NAME = None def __init__(self, positions, end=None): """ Construct a Feature which may apply at C{positions}. #For instance, importing some concrete subclasses (Feature is abstract) >>> from nltk.tag.brill import Word, Pos #Feature Word, applying at one of [-2, -1] >>> Word([-2,-1]) Word([-2, -1]) #Positions need not be contiguous >>> Word([-2,-1, 1]) Word([-2, -1, 1]) #Contiguous ranges can alternatively be specified giving the #two endpoints (inclusive) >>> Pos(-3, -1) Pos([-3, -2, -1]) #In two-arg form, start <= end is enforced >>> Pos(2, 1) Traceback (most recent call last): File "", line 1, in File "nltk/tbl/template.py", line 306, in __init__ raise TypeError ValueError: illegal interval specification: (start=2, end=1) :type positions: list of int :param positions: the positions at which this features should apply :raises ValueError: illegal position specifications An alternative calling convention, for contiguous positions only, is Feature(start, end): :type start: int :param start: start of range where this feature should apply :type end: int :param end: end of range (NOTE: inclusive!) 
where this feature should apply """ self.positions = None # to avoid warnings if end is None: self.positions = tuple(sorted(set([int(i) for i in positions]))) else: # positions was actually not a list, but only the start index try: if positions > end: raise TypeError self.positions = tuple(range(positions, end+1)) except TypeError: # let any kind of erroneous spec raise ValueError raise ValueError("illegal interval specification: (start={0}, end={1})".format(positions, end)) # set property name given in subclass, or otherwise name of subclass self.PROPERTY_NAME = self.__class__.PROPERTY_NAME or self.__class__.__name__ def encode_json_obj(self): return self.positions @classmethod def decode_json_obj(cls, obj): positions = obj return cls(positions) def __repr__(self): return "%s(%r)" % ( self.__class__.__name__, list(self.positions)) @classmethod def expand(cls, starts, winlens, excludezero=False): """ Return a list of features, one for each start point in starts and for each window length in winlen. If excludezero is True, no Features containing 0 in its positions will be generated (many tbl trainers have a special representation for the target feature at [0]) For instance, importing a concrete subclass (Feature is abstract) >>> from nltk.tag.brill import Word First argument gives the possible start positions, second the possible window lengths >>> Word.expand([-3,-2,-1], [1]) [Word([-3]), Word([-2]), Word([-1])] >>> Word.expand([-2,-1], [1]) [Word([-2]), Word([-1])] >>> Word.expand([-3,-2,-1], [1,2]) [Word([-3]), Word([-2]), Word([-1]), Word([-3, -2]), Word([-2, -1])] >>> Word.expand([-2,-1], [1]) [Word([-2]), Word([-1])] a third optional argument excludes all Features whose positions contain zero >>> Word.expand([-2,-1,0], [1,2], excludezero=False) [Word([-2]), Word([-1]), Word([0]), Word([-2, -1]), Word([-1, 0])] >>> Word.expand([-2,-1,0], [1,2], excludezero=True) [Word([-2]), Word([-1]), Word([-2, -1])] All window lengths must be positive >>> Word.expand([-2,-1], [0]) Traceback (most recent call last): File "", line 1, in File "nltk/tag/tbl/template.py", line 371, in expand :param starts: where to start looking for Feature ValueError: non-positive window length in [0] :param starts: where to start looking for Feature :type starts: list of ints :param winlens: window lengths where to look for Feature :type starts: list of ints :param excludezero: do not output any Feature with 0 in any of its positions. :type excludezero: bool :returns: list of Features :raises ValueError: for non-positive window lengths """ if not all(x > 0 for x in winlens): raise ValueError("non-positive window length in {0}".format(winlens)) xs = (starts[i:i+w] for w in winlens for i in range(len(starts)-w+1)) return [cls(x) for x in xs if not (excludezero and 0 in x)] def issuperset(self, other): """ Return True if this Feature always returns True when other does More precisely, return True if this feature refers to the same property as other; and this Feature looks at all positions that other does (and possibly other positions in addition). 
#For instance, importing a concrete subclass (Feature is abstract) >>> from nltk.tag.brill import Word, Pos >>> Word([-3,-2,-1]).issuperset(Word([-3,-2])) True >>> Word([-3,-2,-1]).issuperset(Word([-3,-2, 0])) False #Feature subclasses must agree >>> Word([-3,-2,-1]).issuperset(Pos([-3,-2])) False :param other: feature with which to compare :type other: (subclass of) Feature :return: True if this feature is superset, otherwise False :rtype: bool """ return self.__class__ is other.__class__ and set(self.positions) >= set(other.positions) def intersects(self, other): """ Return True if the positions of this Feature intersects with those of other More precisely, return True if this feature refers to the same property as other; and there is some overlap in the positions they look at. #For instance, importing a concrete subclass (Feature is abstract) >>> from nltk.tag.brill import Word, Pos >>> Word([-3,-2,-1]).intersects(Word([-3,-2])) True >>> Word([-3,-2,-1]).intersects(Word([-3,-2, 0])) True >>> Word([-3,-2,-1]).intersects(Word([0])) False #Feature subclasses must agree >>> Word([-3,-2,-1]).intersects(Pos([-3,-2])) False :param other: feature with which to compare :type other: (subclass of) Feature :return: True if feature classes agree and there is some overlap in the positions they look at :rtype: bool """ return bool((self.__class__ is other.__class__ and set(self.positions) & set(other.positions))) # Rich comparisons for Features. With @functools.total_ordering (Python 2.7+), # it will be enough to define __lt__ and __eq__ def __eq__(self, other): return (self.__class__ is other.__class__ and self.positions == other.positions) def __lt__(self, other): return ( self.__class__.__name__ < other.__class__.__name__ or # self.positions is a sorted tuple of ints self.positions < other.positions ) def __ne__(self, other): return not (self == other) def __gt__(self, other): return other < self def __ge__(self, other): return not self < other def __le__(self, other): return self < other or self == other @staticmethod def extract_property(tokens, index): """ Any subclass of Feature must define static method extract_property(tokens, index) :param tokens: the sequence of tokens :type tokens: list of tokens :param index: the current index :type index: int :return: feature value :rtype: any (but usually scalar) """ raise NotImplementedError nltk-3.1/nltk/tbl/rule.py0000644000076500000240000002605212607224144015135 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2015 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT from __future__ import print_function from nltk.compat import python_2_unicode_compatible, unicode_repr from nltk import jsontags ###################################################################### # Tag Rules ###################################################################### class TagRule(object): """ An interface for tag transformations on a tagged corpus, as performed by tbl taggers. Each transformation finds all tokens in the corpus that are tagged with a specific original tag and satisfy a specific condition, and replaces their tags with a replacement tag. For any given transformation, the original tag, replacement tag, and condition are fixed. Conditions may depend on the token under consideration, as well as any other tokens in the corpus. 
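As a purely illustrative sketch (the tags, the sentence and the single Pos condition
below are invented for the example; ``Rule`` is the concrete subclass defined later in
this module and ``Pos`` comes from ``nltk.tag.brill``), a rule that retags 'NN' as 'VB'
when the preceding tag is 'TO' can be applied like this:

    >>> from nltk.tbl.rule import Rule
    >>> from nltk.tag.brill import Pos
    >>> r = Rule('1', 'NN', 'VB', [(Pos([-1]), 'TO')])
    >>> sent = [('to', 'TO'), ('race', 'NN'), ('fast', 'RB')]
    >>> r.apply(sent)          # returns the indices whose tags were changed
    [1]
    >>> sent                   # the tag at index 1 has been rewritten in place
    [('to', 'TO'), ('race', 'VB'), ('fast', 'RB')]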
Tag rules must be comparable and hashable. """ def __init__(self, original_tag, replacement_tag): self.original_tag = original_tag """The tag which this TagRule may cause to be replaced.""" self.replacement_tag = replacement_tag """The tag with which this TagRule may replace another tag.""" def apply(self, tokens, positions=None): """ Apply this rule at every position in positions where it applies to the given sentence. I.e., for each position p in *positions*, if *tokens[p]* is tagged with this rule's original tag, and satisfies this rule's condition, then set its tag to be this rule's replacement tag. :param tokens: The tagged sentence :type tokens: list(tuple(str, str)) :type positions: list(int) :param positions: The positions where the transformation is to be tried. If not specified, try it at all positions. :return: The indices of tokens whose tags were changed by this rule. :rtype: int """ if positions is None: positions = list(range(len(tokens))) # Determine the indices at which this rule applies. change = [i for i in positions if self.applies(tokens, i)] # Make the changes. Note: this must be done in a separate # step from finding applicable locations, since we don't want # the rule to interact with itself. for i in change: tokens[i] = (tokens[i][0], self.replacement_tag) return change def applies(self, tokens, index): """ :return: True if the rule would change the tag of ``tokens[index]``, False otherwise :rtype: bool :param tokens: A tagged sentence :type tokens: list(str) :param index: The index to check :type index: int """ raise NotImplementedError # Rules must be comparable and hashable for the algorithm to work def __eq__(self, other): raise TypeError("Rules must implement __eq__()") def __ne__(self, other): raise TypeError("Rules must implement __ne__()") def __hash__(self): raise TypeError("Rules must implement __hash__()") @python_2_unicode_compatible @jsontags.register_tag class Rule(TagRule): """ A Rule checks the current corpus position for a certain set of conditions; if they are all fulfilled, the Rule is triggered, meaning that it will change tag A to tag B. For other tags than A, nothing happens. The conditions are parameters to the Rule instance. Each condition is a feature-value pair, with a set of positions to check for the value of the corresponding feature. Conceptually, the positions are joined by logical OR, and the feature set by logical AND. More formally, the Rule is then applicable to the M{n}th token iff: - The M{n}th token is tagged with the Rule's original tag; and - For each (Feature(positions), M{value}) tuple: - The value of Feature of at least one token in {n+p for p in positions} is M{value}. """ json_tag = 'nltk.tbl.Rule' def __init__(self, templateid, original_tag, replacement_tag, conditions): """ Construct a new Rule that changes a token's tag from C{original_tag} to C{replacement_tag} if all of the properties specified in C{conditions} hold. @type templateid: string @param templateid: the template id (a zero-padded string, '001' etc, so it will sort nicely) @type conditions: C{iterable} of C{Feature} @param conditions: A list of Feature(positions), each of which specifies that the property (computed by Feature.extract_property()) of at least one token in M{n} + p in positions is C{value}. 
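A minimal, hedged illustration of the "at least one position" semantics described
above (the tags and the condition are invented for the example; only the behaviour
of ``applies()`` is shown): with conditions = [(Pos([-2, -1]), 'DT')], the rule
fires if either of the two preceding tags is 'DT'.

    >>> from nltk.tbl.rule import Rule
    >>> from nltk.tag.brill import Pos
    >>> r = Rule('001', 'NN', 'VB', [(Pos([-2, -1]), 'DT')])
    >>> r.applies([('the', 'DT'), ('big', 'JJ'), ('dog', 'NN')], 2)
    True
    >>> r.applies([('very', 'RB'), ('big', 'JJ'), ('dog', 'NN')], 2)
    False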
""" TagRule.__init__(self, original_tag, replacement_tag) self._conditions = conditions self.templateid = templateid def encode_json_obj(self): return { 'templateid': self.templateid, 'original': self.original_tag, 'replacement': self.replacement_tag, 'conditions': self._conditions, } @classmethod def decode_json_obj(cls, obj): return cls(obj['templateid'], obj['original'], obj['replacement'], obj['conditions']) def applies(self, tokens, index): # Inherit docs from TagRule # Does the given token have this Rule's "original tag"? if tokens[index][1] != self.original_tag: return False # Check to make sure that every condition holds. for (feature, val) in self._conditions: # Look for *any* token that satisfies the condition. for pos in feature.positions: if not (0 <= index + pos < len(tokens)): continue if feature.extract_property(tokens, index+pos) == val: break else: # No token satisfied the condition; return false. return False # Every condition checked out, so the Rule is applicable. return True def __eq__(self, other): return (self is other or (other is not None and other.__class__ == self.__class__ and self.original_tag == other.original_tag and self.replacement_tag == other.replacement_tag and self._conditions == other._conditions)) def __ne__(self, other): return not (self == other) def __hash__(self): # Cache our hash value (justified by profiling.) try: return self.__hash except AttributeError: self.__hash = hash(repr(self)) return self.__hash def __repr__(self): # Cache the repr (justified by profiling -- this is used as # a sort key when deterministic=True.) try: return self.__repr except AttributeError: self.__repr = ( "{0}('{1}', {2}, {3}, [{4}])".format( self.__class__.__name__, self.templateid, unicode_repr(self.original_tag), unicode_repr(self.replacement_tag), # list(self._conditions) would be simpler but will not generate # the same Rule.__repr__ in python 2 and 3 and thus break some tests ', '.join("({0},{1})".format(f, unicode_repr(v)) for (f, v) in self._conditions) ) ) return self.__repr def __str__(self): def _condition_to_logic(feature, value): """ Return a compact, predicate-logic styled string representation of the given condition. """ return '{0}:{1}@[{2}]'.format( feature.PROPERTY_NAME, value, ",".join(str(w) for w in feature.positions) ) conditions = ' & '.join([_condition_to_logic(f, v) for (f, v) in self._conditions]) s = '{0}->{1} if {2}'.format( self.original_tag, self.replacement_tag, conditions ) return s def format(self, fmt): """ Return a string representation of this rule. 
>>> from nltk.tbl.rule import Rule >>> from nltk.tag.brill import Pos >>> r = Rule("23", "VB", "NN", [(Pos([-2,-1]), 'DT')]) r.format("str") == str(r) True >>> r.format("str") 'VB->NN if Pos:DT@[-2,-1]' r.format("repr") == repr(r) True >>> r.format("repr") "Rule('23', 'VB', 'NN', [(Pos([-2, -1]),'DT')])" >>> r.format("verbose") 'VB -> NN if the Pos of words i-2...i-1 is "DT"' >>> r.format("not_found") Traceback (most recent call last): File "", line 1, in File "nltk/tbl/rule.py", line 256, in format raise ValueError("unknown rule format spec: {0}".format(fmt)) ValueError: unknown rule format spec: not_found >>> :param fmt: format specification :type fmt: str :return: string representation :rtype: str """ if fmt == "str": return self.__str__() elif fmt == "repr": return self.__repr__() elif fmt == "verbose": return self._verbose_format() else: raise ValueError("unknown rule format spec: {0}".format(fmt)) def _verbose_format(self): """ Return a wordy, human-readable string representation of the given rule. Not sure how useful this is. """ def condition_to_str(feature, value): return ('the %s of %s is "%s"' % (feature.PROPERTY_NAME, range_to_str(feature.positions), value)) def range_to_str(positions): if len(positions) == 1: p = positions[0] if p == 0: return 'this word' if p == -1: return 'the preceding word' elif p == 1: return 'the following word' elif p < 0: return 'word i-%d' % -p elif p > 0: return 'word i+%d' % p else: # for complete compatibility with the wordy format of nltk2 mx = max(positions) mn = min(positions) if mx - mn == len(positions) - 1: return 'words i%+d...i%+d' % (mn, mx) else: return 'words {%s}' % (",".join("i%+d" % d for d in positions),) replacement = '%s -> %s' % (self.original_tag, self.replacement_tag) conditions = (' if ' if self._conditions else "") + ', and '.join( condition_to_str(f, v) for (f, v) in self._conditions ) return replacement + conditions nltk-3.1/nltk/tbl/template.py0000644000076500000240000003061612607224144016002 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Transformation-based learning # # Copyright (C) 2001-2015 NLTK Project # Author: Marcus Uneson # based on previous (nltk2) version by # Christopher Maloof, Edward Loper, Steven Bird # URL: # For license information, see LICENSE.TXT from __future__ import print_function import itertools as it from nltk.tbl.feature import Feature class BrillTemplateI(object): """ An interface for generating lists of transformational rules that apply at given sentence positions. ``BrillTemplateI`` is used by ``Brill`` training algorithms to generate candidate rules. """ #!!FOR_FUTURE: when targeting python3 only, consider @abc.abstractmethod # and metaclass=abc.ABCMeta rather than NotImplementedError #http://julien.danjou.info/blog/2013/guide-python-static-class-abstract-methods def applicable_rules(self, tokens, i, correctTag): """ Return a list of the transformational rules that would correct the *i*th subtoken's tag in the given token. In particular, return a list of zero or more rules that would change *tokens*[i][1] to *correctTag*, if applied to *token*[i]. If the *i*th token already has the correct tag (i.e., if tagged_tokens[i][1] == correctTag), then ``applicable_rules()`` should return the empty list. :param tokens: The tagged tokens being tagged. :type tokens: list(tuple) :param i: The index of the token whose tag should be corrected. :type i: int :param correctTag: The correct tag for the *i*th token. 
:type correctTag: any :rtype: list(BrillRule) """ raise NotImplementedError def get_neighborhood(self, token, index): """ Returns the set of indices *i* such that ``applicable_rules(token, i, ...)`` depends on the value of the *index*th token of *token*. This method is used by the "fast" Brill tagger trainer. :param token: The tokens being tagged. :type token: list(tuple) :param index: The index whose neighborhood should be returned. :type index: int :rtype: set """ raise NotImplementedError from nltk.tbl.rule import Rule class Template(BrillTemplateI): """ A tbl Template that generates a list of L{Rule}s that apply at a given sentence position. In particular, each C{Template} is parameterized by a list of independent features (a combination of a specific property to extract and a list C{L} of relative positions at which to extract it) and generates all Rules that: - use the given features, each at its own independent position; and - are applicable to the given token. """ ALLTEMPLATES = [] #record a unique id of form "001", for each template created # _ids = it.count(0) def __init__(self, *features): """ Construct a Template for generating Rules. Takes a list of Features. A C{Feature} is a combination of a specific property and its relative positions and should be a subclass of L{nltk.tbl.feature.Feature}. An alternative calling convention (kept for backwards compatibility, but less expressive as it only permits one feature type) is Template(Feature, (start1, end1), (start2, end2), ...) In new code, that would be better written Template(Feature(start1, end1), Feature(start2, end2), ...) #For instance, importing some features >>> from nltk.tbl.template import Template >>> from nltk.tag.brill import Word, Pos #create some features >>> wfeat1, wfeat2, pfeat = (Word([-1]), Word([1,2]), Pos([-2,-1])) #Create a single-feature template >>> Template(wfeat1) Template(Word([-1])) #or a two-feature one >>> Template(wfeat1, wfeat2) Template(Word([-1]),Word([1, 2])) #or a three-feature one with two different feature types >>> Template(wfeat1, wfeat2, pfeat) Template(Word([-1]),Word([1, 2]),Pos([-2, -1])) #deprecated api: Feature subclass, followed by list of (start,end) pairs #(permits only a single Feature) >>> Template(Word, (-2,-1), (0,0)) Template(Word([-2, -1]),Word([0])) #incorrect specification raises TypeError >>> Template(Word, (-2,-1), Pos, (0,0)) Traceback (most recent call last): File "", line 1, in File "nltk/tag/tbl/template.py", line 143, in __init__ raise TypeError( TypeError: expected either Feature1(args), Feature2(args), ... or Feature, (start1, end1), (start2, end2), ... :type features: list of Features :param features: the features to build this Template on """ #determine the calling form: either #Template(Feature, args1, [args2, ...)] #Template(Feature1(args), Feature2(args), ...) if all(isinstance(f, Feature) for f in features): self._features = features elif issubclass(features[0], Feature) and all(isinstance(a, tuple) for a in features[1:]): self._features = [features[0](*tp) for tp in features[1:]] else: raise TypeError( "expected either Feature1(args), Feature2(args), ... 
or Feature, (start1, end1), (start2, end2), ...") self.id = "{0:03d}".format(len(self.ALLTEMPLATES)) self.ALLTEMPLATES.append(self) def __repr__(self): return "%s(%s)" % (self.__class__.__name__, ",".join([str(f) for f in self._features])) def applicable_rules(self, tokens, index, correct_tag): if tokens[index][1] == correct_tag: return [] # For each of this Template's features, find the conditions # that are applicable for the given token. # Then, generate one Rule for each combination of features # (the crossproduct of the conditions). applicable_conditions = self._applicable_conditions(tokens, index) xs = list(it.product(*applicable_conditions)) return [Rule(self.id, tokens[index][1], correct_tag, tuple(x)) for x in xs] def _applicable_conditions(self, tokens, index): """ :returns: A set of all conditions for rules that are applicable to C{tokens[index]}. """ conditions = [] for feature in self._features: conditions.append([]) for pos in feature.positions: if not (0 <= index+pos < len(tokens)): continue value = feature.extract_property(tokens, index+pos) conditions[-1].append( (feature, value) ) return conditions def get_neighborhood(self, tokens, index): # inherit docs from BrillTemplateI # applicable_rules(tokens, index, ...) depends on index. neighborhood = set([index]) #set literal for python 2.7+ # applicable_rules(tokens, i, ...) depends on index if # i+start < index <= i+end. allpositions = [0] + [p for feat in self._features for p in feat.positions] start, end = min(allpositions), max(allpositions) s = max(0, index+(-end)) e = min(index+(-start)+1, len(tokens)) for i in range(s, e): neighborhood.add(i) return neighborhood @classmethod def expand(cls, featurelists, combinations=None, skipintersecting=True): """ Factory method to mass generate Templates from a list L of lists of Features. #With combinations=(k1, k2), the function will in all possible ways choose k1 ... k2 #of the sublists in L; it will output all Templates formed by the Cartesian product #of this selection, with duplicates and other semantically equivalent #forms removed. Default for combinations is (1, len(L)). The feature lists may have been specified manually, or generated from Feature.expand(). 
For instance, >>> from nltk.tbl.template import Template >>> from nltk.tag.brill import Word, Pos #creating some features >>> (wd_0, wd_01) = (Word([0]), Word([0,1])) >>> (pos_m2, pos_m33) = (Pos([-2]), Pos([3-2,-1,0,1,2,3])) >>> list(Template.expand([[wd_0], [pos_m2]])) [Template(Word([0])), Template(Pos([-2])), Template(Pos([-2]),Word([0]))] >>> list(Template.expand([[wd_0, wd_01], [pos_m2]])) [Template(Word([0])), Template(Word([0, 1])), Template(Pos([-2])), Template(Pos([-2]),Word([0])), Template(Pos([-2]),Word([0, 1]))] #note: with Feature.expand(), it is very easy to generate more templates #than your system can handle -- for instance, >>> wordtpls = Word.expand([-2,-1,0,1], [1,2], excludezero=False) >>> len(wordtpls) 7 >>> postpls = Pos.expand([-3,-2,-1,0,1,2], [1,2,3], excludezero=True) >>> len(postpls) 9 #and now the Cartesian product of all non-empty combinations of two wordtpls and #two postpls, with semantic equivalents removed >>> templates = list(Template.expand([wordtpls, wordtpls, postpls, postpls])) >>> len(templates) 713 will return a list of eight templates Template(Word([0])), Template(Word([0, 1])), Template(Pos([-2])), Template(Pos([-1])), Template(Pos([-2]),Word([0])), Template(Pos([-1]),Word([0])), Template(Pos([-2]),Word([0, 1])), Template(Pos([-1]),Word([0, 1]))] #Templates where one feature is a subset of another, such as #Template(Word([0,1]), Word([1]), will not appear in the output. #By default, this non-subset constraint is tightened to disjointness: #Templates of type Template(Word([0,1]), Word([1,2]) will also be filtered out. #With skipintersecting=False, then such Templates are allowed WARNING: this method makes it very easy to fill all your memory when training generated templates on any real-world corpus :param featurelists: lists of Features, whose Cartesian product will return a set of Templates :type featurelists: list of (list of Features) :param combinations: given n featurelists: if combinations=k, all generated Templates will have k features; if combinations=(k1,k2) they will have k1..k2 features; if None, defaults to 1..n :type combinations: None, int, or (int, int) :param skipintersecting: if True, do not output intersecting Templates (non-disjoint positions for some feature) :type skipintersecting: bool :returns: generator of Templates """ def nonempty_powerset(xs): #xs is a list #itertools docnonempty_powerset([1,2,3]) --> (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3) #find the correct tuple given combinations, one of {None, k, (k1,k2)} k = combinations #for brevity combrange = ((1, len(xs)+1) if k is None else #n over 1 .. n over n (all non-empty combinations) (k, k+1) if isinstance(k, int) else #n over k (only (k[0], k[1]+1)) #n over k1, n over k1+1... 
n over k2 return it.chain.from_iterable(it.combinations(xs, r) for r in range(*combrange)) seentemplates = set() for picks in nonempty_powerset(featurelists): for pick in it.product(*picks): if any(i != j and x.issuperset(y) for (i, x) in enumerate(pick) for (j,y) in enumerate(pick)): continue if skipintersecting and any(i != j and x.intersects(y) for (i, x) in enumerate(pick) for (j, y) in enumerate(pick)): continue thistemplate = cls(*sorted(pick)) strpick = str(thistemplate) #!!FIXME --this is hackish if strpick in seentemplates: #already added cls._poptemplate() continue seentemplates.add(strpick) yield thistemplate @classmethod def _cleartemplates(cls): cls.ALLTEMPLATES = [] @classmethod def _poptemplate(cls): return cls.ALLTEMPLATES.pop() if cls.ALLTEMPLATES else None nltk-3.1/nltk/test/0000755000076500000240000000000012610001541013771 5ustar sbstaff00000000000000nltk-3.1/nltk/test/__init__.py0000644000076500000240000000072012607224144016115 0ustar sbstaff00000000000000# Natural Language Toolkit: Unit Tests # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # URL: # For license information, see LICENSE.TXT """ Unit tests for the NLTK modules. These tests are intended to ensure that source code changes don't accidentally introduce bugs. For instructions, please see: ../../web/dev/local_testing.rst https://github.com/nltk/nltk/blob/develop/web/dev/local_testing.rst """ nltk-3.1/nltk/test/all.py0000644000076500000240000000144212574600335015133 0ustar sbstaff00000000000000"""Test suite that runs all NLTK tests. This module, `nltk.test.all`, is named as the NLTK ``test_suite`` in the project's ``setup-eggs.py`` file. Here, we create a test suite that runs all of our doctests, and return it for processing by the setuptools test harness. """ import doctest, unittest from glob import glob import os.path def additional_tests(): #print "here-000000000000000" #print "-----", glob(os.path.join(os.path.dirname(__file__), '*.doctest')) dir = os.path.dirname(__file__) paths = glob(os.path.join(dir, '*.doctest')) files = [ os.path.basename(path) for path in paths ] return unittest.TestSuite( [ doctest.DocFileSuite(file) for file in files ] ) #if os.path.split(path)[-1] != 'index.rst' # skips time-dependent doctest in index.rst nltk-3.1/nltk/test/bleu.doctest0000644000076500000240000000045012607224144016322 0ustar sbstaff00000000000000========== BLEU tests ========== >>> from nltk.translate import bleu If the candidate has no alignment to any of the references, the BLEU score is 0. >>> bleu( ... ['The candidate has no alignment to any of the references'.split()], ... 'John loves Mary'.split(), ... [1], ... ) 0 nltk-3.1/nltk/test/bnc.doctest0000644000076500000240000000370712607224144016145 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT >>> import os.path >>> from nltk.corpus.reader import BNCCorpusReader >>> import nltk.test >>> root = os.path.dirname(nltk.test.__file__) >>> bnc = BNCCorpusReader(root=root, fileids='FX8.xml') Checking the word access. ------------------------- >>> len(bnc.words()) 151 >>> bnc.words()[:6] ['Ah', 'there', 'we', 'are', ',', '.'] >>> bnc.words(stem=True)[:6] ['ah', 'there', 'we', 'be', ',', '.'] >>> bnc.tagged_words()[:6] [('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')] >>> bnc.tagged_words(c5=True)[:6] [('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')] Testing access to the sentences. 
-------------------------------- >>> len(bnc.sents()) 15 >>> bnc.sents()[0] ['Ah', 'there', 'we', 'are', ',', '.'] >>> bnc.sents(stem=True)[0] ['ah', 'there', 'we', 'be', ',', '.'] >>> bnc.tagged_sents()[0] [('Ah', 'INTERJ'), ('there', 'ADV'), ('we', 'PRON'), ('are', 'VERB'), (',', 'PUN'), ('.', 'PUN')] >>> bnc.tagged_sents(c5=True)[0] [('Ah', 'ITJ'), ('there', 'AV0'), ('we', 'PNP'), ('are', 'VBB'), (',', 'PUN'), ('.', 'PUN')] A not lazy loader. ------------------ >>> eager = BNCCorpusReader(root=root, fileids=r'FX8.xml', lazy=False) >>> len(eager.words()) 151 >>> eager.words(stem=True)[6:17] ['right', 'abdominal', 'wound', ',', 'she', 'be', 'a', 'wee', 'bit', 'confuse', '.'] >>> eager.tagged_words()[6:11] [('Right', 'ADV'), ('abdominal', 'ADJ'), ('wound', 'SUBST'), (',', 'PUN'), ('she', 'PRON')] >>> eager.tagged_words(c5=True)[6:17] [('Right', 'AV0'), ('abdominal', 'AJ0'), ('wound', 'NN1'), (',', 'PUN'), ('she', 'PNP'), ("'s", 'VBZ'), ('a', 'AT0'), ('wee', 'AJ0-NN1'), ('bit', 'NN1'), ('confused', 'VVN-AJ0'), ('.', 'PUN')] >>> len(eager.sents()) 15 nltk-3.1/nltk/test/ccg.doctest0000644000076500000240000005056712607224144016145 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ============================== Combinatory Categorial Grammar ============================== Relative Clauses ---------------- >>> from nltk.ccg import chart, lexicon Construct a lexicon: >>> lex = lexicon.parseLexicon(''' ... :- S, NP, N, VP ... ... Det :: NP/N ... Pro :: NP ... Modal :: S\\NP/VP ... ... TV :: VP/NP ... DTV :: TV/NP ... ... the => Det ... ... that => Det ... that => NP ... ... I => Pro ... you => Pro ... we => Pro ... ... chef => N ... cake => N ... children => N ... dough => N ... ... will => Modal ... should => Modal ... might => Modal ... must => Modal ... ... and => var\\.,var/.,var ... ... to => VP[to]/VP ... ... without => (VP\\VP)/VP[ing] ... ... be => TV ... cook => TV ... eat => TV ... ... cooking => VP[ing]/NP ... ... give => DTV ... ... is => (S\\NP)/NP ... prefer => (S\\NP)/NP ... ... which => (N\\N)/(S/NP) ... ... persuade => (VP/VP[to])/NP ... ''') >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) >>> for parse in parser.parse("you prefer that cake".split()): # doctest: +SKIP ... chart.printCCGDerivation(parse) ... break ... you prefer that cake NP ((S\NP)/NP) (NP/N) N --------------> NP ---------------------------> (S\NP) --------------------------------< S >>> for parse in parser.parse("that is the cake which you prefer".split()): # doctest: +SKIP ... chart.printCCGDerivation(parse) ... break ... that is the cake which you prefer NP ((S\NP)/NP) (NP/N) N ((N\N)/(S/NP)) NP ((S\NP)/NP) ----->T (S/(S\NP)) ------------------>B (S/NP) ----------------------------------> (N\N) ----------------------------------------< N ------------------------------------------------> NP -------------------------------------------------------------> (S\NP) -------------------------------------------------------------------< S Some other sentences to try: "that is the cake which we will persuade the chef to cook" "that is the cake which we will persuade the chef to give the children" >>> sent = "that is the dough which you will eat without cooking".split() >>> nosub_parser = chart.CCGChartParser(lex, chart.ApplicationRuleSet + ... chart.CompositionRuleSet + chart.TypeRaiseRuleSet) Without Substitution (no output) >>> for parse in nosub_parser.parse(sent): ... 
chart.printCCGDerivation(parse) With Substitution: >>> for parse in parser.parse(sent): # doctest: +SKIP ... chart.printCCGDerivation(parse) ... break ... that is the dough which you will eat without cooking NP ((S\NP)/NP) (NP/N) N ((N\N)/(S/NP)) NP ((S\NP)/VP) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP) ----->T (S/(S\NP)) ------------------------------------->B ((VP\VP)/NP) ----------------------------------------------B ((S\NP)/NP) ---------------------------------------------------------------->B (S/NP) --------------------------------------------------------------------------------> (N\N) ---------------------------------------------------------------------------------------< N -----------------------------------------------------------------------------------------------> NP ------------------------------------------------------------------------------------------------------------> (S\NP) ------------------------------------------------------------------------------------------------------------------< S Conjunction ----------- >>> from nltk.ccg.chart import CCGChartParser, ApplicationRuleSet, CompositionRuleSet >>> from nltk.ccg.chart import SubstitutionRuleSet, TypeRaiseRuleSet, printCCGDerivation >>> from nltk.ccg import lexicon Lexicons for the tests: >>> test1_lex = ''' ... :- S,N,NP,VP ... I => NP ... you => NP ... will => S\\NP/VP ... cook => VP/NP ... which => (N\\N)/(S/NP) ... and => var\\.,var/.,var ... might => S\\NP/VP ... eat => VP/NP ... the => NP/N ... mushrooms => N ... parsnips => N''' >>> test2_lex = ''' ... :- N, S, NP, VP ... articles => N ... the => NP/N ... and => var\\.,var/.,var ... which => (N\\N)/(S/NP) ... I => NP ... anyone => NP ... will => (S/VP)\\NP ... file => VP/NP ... without => (VP\\VP)/VP[ing] ... forget => VP/NP ... reading => VP[ing]/NP ... ''' Tests handling of conjunctions. Note that while the two derivations are different, they are semantically equivalent. >>> lex = lexicon.parseLexicon(test1_lex) >>> parser = CCGChartParser(lex, ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet) >>> for parse in parser.parse("I will cook and might eat the mushrooms and parsnips".split()): ... 
printCCGDerivation(parse) # doctest: +NORMALIZE_WHITESPACE +SKIP I will cook and might eat the mushrooms and parsnips NP ((S\NP)/VP) (VP/NP) ((_var2\.,_var2)/.,_var2) ((S\NP)/VP) (VP/NP) (NP/N) N ((_var2\.,_var2)/.,_var2) N ---------------------->B ((S\NP)/NP) ---------------------->B ((S\NP)/NP) -------------------------------------------------> (((S\NP)/NP)\.,((S\NP)/NP)) -----------------------------------------------------------------------< ((S\NP)/NP) -------------------------------------> (N\.,N) ------------------------------------------------< N --------------------------------------------------------> NP -------------------------------------------------------------------------------------------------------------------------------> (S\NP) -----------------------------------------------------------------------------------------------------------------------------------< S I will cook and might eat the mushrooms and parsnips NP ((S\NP)/VP) (VP/NP) ((_var2\.,_var2)/.,_var2) ((S\NP)/VP) (VP/NP) (NP/N) N ((_var2\.,_var2)/.,_var2) N ---------------------->B ((S\NP)/NP) ---------------------->B ((S\NP)/NP) -------------------------------------------------> (((S\NP)/NP)\.,((S\NP)/NP)) -----------------------------------------------------------------------< ((S\NP)/NP) ------------------------------------------------------------------------------->B ((S\NP)/N) -------------------------------------> (N\.,N) ------------------------------------------------< N -------------------------------------------------------------------------------------------------------------------------------> (S\NP) -----------------------------------------------------------------------------------------------------------------------------------< S Tests handling subject extraction. Interesting to point that the two parses are clearly semantically different. >>> lex = lexicon.parseLexicon(test2_lex) >>> parser = CCGChartParser(lex, ApplicationRuleSet + CompositionRuleSet + SubstitutionRuleSet) >>> for parse in parser.parse("articles which I will file and forget without reading".split()): ... 
printCCGDerivation(parse) # doctest: +NORMALIZE_WHITESPACE +SKIP articles which I will file and forget without reading N ((N\N)/(S/NP)) NP ((S/VP)\NP) (VP/NP) ((_var3\.,_var3)/.,_var3) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP) -----------------< (S/VP) ------------------------------------->B ((VP\VP)/NP) ---------------------------------------------- ((VP/NP)\.,(VP/NP)) ----------------------------------------------------------------------------------< (VP/NP) --------------------------------------------------------------------------------------------------->B (S/NP) -------------------------------------------------------------------------------------------------------------------> (N\N) -----------------------------------------------------------------------------------------------------------------------------< N articles which I will file and forget without reading N ((N\N)/(S/NP)) NP ((S/VP)\NP) (VP/NP) ((_var3\.,_var3)/.,_var3) (VP/NP) ((VP\VP)/VP['ing']) (VP['ing']/NP) -----------------< (S/VP) ------------------------------------> ((VP/NP)\.,(VP/NP)) ---------------------------------------------< (VP/NP) ------------------------------------->B ((VP\VP)/NP) ----------------------------------------------------------------------------------B (S/NP) -------------------------------------------------------------------------------------------------------------------> (N\N) -----------------------------------------------------------------------------------------------------------------------------< N Unicode support --------------- Unicode words are supported. >>> from nltk.ccg import chart, lexicon Lexicons for the tests: >>> lex = lexicon.parseLexicon(u''' ... :- S, N, NP, PP ... ... AdjI :: N\\N ... AdjD :: N/N ... AdvD :: S/S ... AdvI :: S\\S ... Det :: NP/N ... PrepNPCompl :: PP/NP ... PrepNAdjN :: S\\S/N ... PrepNAdjNP :: S\\S/NP ... VPNP :: S\\NP/NP ... VPPP :: S\\NP/PP ... VPser :: S\\NP/AdjI ... ... auto => N ... bebidas => N ... cine => N ... ley => N ... libro => N ... ministro => N ... panadería => N ... presidente => N ... super => N ... ... el => Det ... la => Det ... las => Det ... un => Det ... ... Ana => NP ... Pablo => NP ... ... y => var\\.,var/.,var ... ... pero => (S/NP)\\(S/NP)/(S/NP) ... ... anunció => VPNP ... compró => VPNP ... cree => S\\NP/S[dep] ... desmintió => VPNP ... lee => VPNP ... fueron => VPPP ... ... es => VPser ... ... interesante => AdjD ... interesante => AdjI ... nueva => AdjD ... nueva => AdjI ... ... a => PrepNPCompl ... en => PrepNAdjN ... en => PrepNAdjNP ... ... ayer => AdvI ... ... que => (NP\\NP)/(S/NP) ... que => S[dep]/S ... ''') >>> parser = chart.CCGChartParser(lex, chart.DefaultRuleSet) >>> for parse in parser.parse(u"el ministro anunció pero el presidente desmintió la nueva ley".split()): ... printCCGDerivation(parse) ... 
break el ministro anunció pero el presidente desmintió la nueva ley (NP/N) N ((S\NP)/NP) (((S/NP)\(S/NP))/(S/NP)) (NP/N) N ((S\NP)/NP) (NP/N) (N/N) N --------Leaf (NP/N) ----------Leaf N ------------------> NP ------------------>T (S/(S\NP)) -------------Leaf ((S\NP)/NP) --------------------------Leaf (((S/NP)\(S/NP))/(S/NP)) --------Leaf (NP/N) ------------Leaf N --------------------> NP -------------------->T (S/(S\NP)) -------------Leaf ((S\NP)/NP) --------------------------------->B (S/NP) -----------------------------------------------------------> ((S/NP)\(S/NP)) --------Leaf (NP/N) -------Leaf (N/N) -----Leaf N ------------> N --------------------> NP -------------------- S nltk-3.1/nltk/test/chat80.doctest0000644000076500000240000002056512607224144016473 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ======= Chat-80 ======= Chat-80 was a natural language system which allowed the user to interrogate a Prolog knowledge base in the domain of world geography. It was developed in the early '80s by Warren and Pereira; see ``_ for a description and ``_ for the source files. The ``chat80`` module contains functions to extract data from the Chat-80 relation files ('the world database'), and convert then into a format that can be incorporated in the FOL models of ``nltk.sem.evaluate``. The code assumes that the Prolog input files are available in the NLTK corpora directory. The Chat-80 World Database consists of the following files:: world0.pl rivers.pl cities.pl countries.pl contain.pl borders.pl This module uses a slightly modified version of ``world0.pl``, in which a set of Prolog rules have been omitted. The modified file is named ``world1.pl``. Currently, the file ``rivers.pl`` is not read in, since it uses a list rather than a string in the second field. Reading Chat-80 Files ===================== Chat-80 relations are like tables in a relational database. The relation acts as the name of the table; the first argument acts as the 'primary key'; and subsequent arguments are further fields in the table. In general, the name of the table provides a label for a unary predicate whose extension is all the primary keys. For example, relations in ``cities.pl`` are of the following form:: 'city(athens,greece,1368).' Here, ``'athens'`` is the key, and will be mapped to a member of the unary predicate *city*. By analogy with NLTK corpora, ``chat80`` defines a number of 'items' which correspond to the relations. >>> from nltk.sem import chat80 >>> print(chat80.items) # doctest: +ELLIPSIS ('borders', 'circle_of_lat', 'circle_of_long', 'city', ...) The fields in the table are mapped to binary predicates. The first argument of the predicate is the primary key, while the second argument is the data in the relevant field. Thus, in the above example, the third field is mapped to the binary predicate *population_of*, whose extension is a set of pairs such as ``'(athens, 1368)'``. An exception to this general framework is required by the relations in the files ``borders.pl`` and ``contains.pl``. These contain facts of the following form:: 'borders(albania,greece).' 'contains0(africa,central_africa).' We do not want to form a unary concept out the element in the first field of these records, and we want the label of the binary relation just to be ``'border'``/``'contain'`` respectively. 
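As a rough, purely illustrative sketch (plain string handling, not part of the
``chat80`` API; it assumes the simple ``pred(arg,...)`` clause format shown above
and would not cope with list-valued fields such as those in ``rivers.pl``), a record
like ``'city(athens,greece,1368).'`` can be split into a predicate and its fields,
and then mapped to the unary and binary relations described above:

    >>> clause = 'city(athens,greece,1368).'
    >>> pred, body = clause.rstrip('.').split('(', 1)
    >>> args = body.rstrip(')').split(',')
    >>> pred, args
    ('city', ['athens', 'greece', '1368'])
    >>> key = args[0]
    >>> (pred, key), ('population_of', (key, args[2]))
    (('city', 'athens'), ('population_of', ('athens', '1368')))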
In order to drive the extraction process, we use 'relation metadata bundles' which are Python dictionaries such as the following:: city = {'label': 'city', 'closures': [], 'schema': ['city', 'country', 'population'], 'filename': 'cities.pl'} According to this, the file ``city['filename']`` contains a list of relational tuples (or more accurately, the corresponding strings in Prolog form) whose predicate symbol is ``city['label']`` and whose relational schema is ``city['schema']``. The notion of a ``closure`` is discussed in the next section. Concepts ======== In order to encapsulate the results of the extraction, a class of ``Concept``\ s is introduced. A ``Concept`` object has a number of attributes, in particular a ``prefLabel``, an arity and ``extension``. >>> c1 = chat80.Concept('dog', arity=1, extension=set(['d1', 'd2'])) >>> print(c1) Label = 'dog' Arity = 1 Extension = ['d1', 'd2'] The ``extension`` attribute makes it easier to inspect the output of the extraction. >>> schema = ['city', 'country', 'population'] >>> concepts = chat80.clause2concepts('cities.pl', 'city', schema) >>> concepts [Concept('city'), Concept('country_of'), Concept('population_of')] >>> for c in concepts: # doctest: +NORMALIZE_WHITESPACE ... print("%s:\n\t%s" % (c.prefLabel, c.extension[:4])) city: ['athens', 'bangkok', 'barcelona', 'berlin'] country_of: [('athens', 'greece'), ('bangkok', 'thailand'), ('barcelona', 'spain'), ('berlin', 'east_germany')] population_of: [('athens', '1368'), ('bangkok', '1178'), ('barcelona', '1280'), ('berlin', '3481')] In addition, the ``extension`` can be further processed: in the case of the ``'border'`` relation, we check that the relation is **symmetric**, and in the case of the ``'contain'`` relation, we carry out the **transitive closure**. The closure properties associated with a concept is indicated in the relation metadata, as indicated earlier. >>> borders = set([('a1', 'a2'), ('a2', 'a3')]) >>> c2 = chat80.Concept('borders', arity=2, extension=borders) >>> print(c2) Label = 'borders' Arity = 2 Extension = [('a1', 'a2'), ('a2', 'a3')] >>> c3 = chat80.Concept('borders', arity=2, closures=['symmetric'], extension=borders) >>> c3.close() >>> print(c3) Label = 'borders' Arity = 2 Extension = [('a1', 'a2'), ('a2', 'a1'), ('a2', 'a3'), ('a3', 'a2')] The ``extension`` of a ``Concept`` object is then incorporated into a ``Valuation`` object. Persistence =========== The functions ``val_dump`` and ``val_load`` are provided to allow a valuation to be stored in a persistent database and re-loaded, rather than having to be re-computed each time. Individuals and Lexical Items ============================= As well as deriving relations from the Chat-80 data, we also create a set of individual constants, one for each entity in the domain. The individual constants are string-identical to the entities. For example, given a data item such as ``'zloty'``, we add to the valuation a pair ``('zloty', 'zloty')``. In order to parse English sentences that refer to these entities, we also create a lexical item such as the following for each individual constant:: PropN[num=sg, sem=<\P.(P zloty)>] -> 'Zloty' The set of rules is written to the file ``chat_pnames.fcfg`` in the current directory. SQL Query ========= The ``city`` relation is also available in RDB form and can be queried using SQL statements. >>> import nltk >>> q = "SELECT City, Population FROM city_table WHERE Country = 'china' and Population > 1000" >>> for answer in chat80.sql_query('corpora/city_database/city.db', q): ... 
print("%-10s %4s" % answer) canton 1496 chungking 1100 mukden 1551 peking 2031 shanghai 5407 tientsin 1795 The (deliberately naive) grammar ``sql.fcfg`` translates from English to SQL: >>> nltk.data.show_cfg('grammars/book_grammars/sql0.fcfg') % start S S[SEM=(?np + WHERE + ?vp)] -> NP[SEM=?np] VP[SEM=?vp] VP[SEM=(?v + ?pp)] -> IV[SEM=?v] PP[SEM=?pp] VP[SEM=(?v + ?ap)] -> IV[SEM=?v] AP[SEM=?ap] NP[SEM=(?det + ?n)] -> Det[SEM=?det] N[SEM=?n] PP[SEM=(?p + ?np)] -> P[SEM=?p] NP[SEM=?np] AP[SEM=?pp] -> A[SEM=?a] PP[SEM=?pp] NP[SEM='Country="greece"'] -> 'Greece' NP[SEM='Country="china"'] -> 'China' Det[SEM='SELECT'] -> 'Which' | 'What' N[SEM='City FROM city_table'] -> 'cities' IV[SEM=''] -> 'are' A[SEM=''] -> 'located' P[SEM=''] -> 'in' Given this grammar, we can express, and then execute, queries in English. >>> cp = nltk.parse.load_parser('grammars/book_grammars/sql0.fcfg') >>> query = 'What cities are in China' >>> for tree in cp.parse(query.split()): ... answer = tree.label()['SEM'] ... q = " ".join(answer) ... print(q) ... SELECT City FROM city_table WHERE Country="china" >>> rows = chat80.sql_query('corpora/city_database/city.db', q) >>> for r in rows: print("%s" % r, end=' ') canton chungking dairen harbin kowloon mukden peking shanghai sian tientsin Using Valuations ----------------- In order to convert such an extension into a valuation, we use the ``make_valuation()`` method; setting ``read=True`` creates and returns a new ``Valuation`` object which contains the results. >>> val = chat80.make_valuation(concepts, read=True) >>> 'calcutta' in val['city'] True >>> [town for (town, country) in val['country_of'] if country == 'india'] ['bombay', 'calcutta', 'delhi', 'hyderabad', 'madras'] >>> dom = val.domain >>> g = nltk.sem.Assignment(dom) >>> m = nltk.sem.Model(dom, val) >>> m.evaluate(r'population_of(jakarta, 533)', g) True nltk-3.1/nltk/test/childes.doctest0000644000076500000240000002157212574600335017021 0ustar sbstaff00000000000000======================= CHILDES Corpus Readers ======================= Read the XML version of the CHILDES corpus. How to use CHILDESCorpusReader ============================== Read the CHILDESCorpusReader class and read the CHILDES corpus saved in the nltk_data directory. >>> import nltk >>> from nltk.corpus.reader import CHILDESCorpusReader >>> corpus_root = nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/') Reading files in the Valian corpus (Valian, 1991). >>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml') >>> valian.fileids() ['Valian/01a.xml', 'Valian/01b.xml', 'Valian/02a.xml', 'Valian/02b.xml',... Count the number of files >>> len(valian.fileids()) 43 Printing properties of the corpus files. >>> corpus_data = valian.corpus(valian.fileids()) >>> print(corpus_data[0]['Lang']) eng >>> for key in sorted(corpus_data[0].keys()): ... print(key, ": ", corpus_data[0][key]) Corpus : valian Date : 1986-03-04 Id : 01a Lang : eng Version : 2.0.1 {http://www.w3.org/2001/XMLSchema-instance}schemaLocation : http://www.talkbank.org/ns/talkbank http://talkbank.org/software/talkbank.xsd Printing information of participants of the corpus. The most common codes for the participants are 'CHI' (target child), 'MOT' (mother), and 'INV' (investigator). >>> corpus_participants = valian.participants(valian.fileids()) >>> for this_corpus_participants in corpus_participants[:2]: ... for key in sorted(this_corpus_participants.keys()): ... dct = this_corpus_participants[key] ... 
print(key, ": ", [(k, dct[k]) for k in sorted(dct.keys())]) CHI : [('age', 'P2Y1M3D'), ('group', 'normal'), ('id', 'CHI'), ('language', 'eng'), ('role', 'Target_Child'), ('sex', 'female')] INV : [('id', 'INV'), ('language', 'eng'), ('role', 'Investigator')] MOT : [('id', 'MOT'), ('language', 'eng'), ('role', 'Mother')] CHI : [('age', 'P2Y1M12D'), ('group', 'normal'), ('id', 'CHI'), ('language', 'eng'), ('role', 'Target_Child'), ('sex', 'female')] INV : [('id', 'INV'), ('language', 'eng'), ('role', 'Investigator')] MOT : [('id', 'MOT'), ('language', 'eng'), ('role', 'Mother')] printing words. >>> valian.words('Valian/01a.xml') ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ... printing sentences. >>> valian.sents('Valian/01a.xml') [['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'is', 'March', 'fourth', 'I', 'believe', 'and', 'when', 'was', "Parent's", 'birthday'], ["Child's"], ['oh', "I'm", 'sorry'], ["that's", 'okay'], ... You can specify the participants with the argument *speaker*. >>> valian.words('Valian/01a.xml',speaker=['INV']) ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', ... >>> valian.words('Valian/01a.xml',speaker=['MOT']) ["Child's", "that's", 'okay', 'February', 'first', 'nineteen', ... >>> valian.words('Valian/01a.xml',speaker=['CHI']) ['tape', 'it', 'up', 'and', 'two', 'tape', 'players', 'have',... tagged_words() and tagged_sents() return the usual (word,pos) tuple lists. POS tags in the CHILDES are automatically assigned by MOR and POST programs (MacWhinney, 2000). >>> valian.tagged_words('Valian/01a.xml')[:30] [('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'), ('with', 'prep'), ('Child', 'n:prop'), ('Lastname', 'n:prop'), ('and', 'coord'), ('it', 'pro'), ('is', 'v:cop'), ('March', 'n:prop'), ('fourth', 'adj'), ('I', 'pro:sub'), ('believe', 'v'), ('and', 'coord'), ('when', 'adv:wh'), ('was', 'v:cop'), ("Parent's", 'n:prop'), ('birthday', 'n'), ("Child's", 'n:prop'), ('oh', 'co'), ("I'm", 'pro:sub'), ('sorry', 'adj'), ("that's", 'pro:dem'), ('okay', 'adj'), ('February', 'n:prop'), ('first', 'adj'), ('nineteen', 'det:num'), ('eighty', 'det:num'), ('four', 'det:num')] >>> valian.tagged_sents('Valian/01a.xml')[:10] [[('at', 'prep'), ('Parent', 'n:prop'), ("Lastname's", 'n:prop'), ('house', 'n'), ('with', 'prep'), ('Child', 'n:prop'), ('Lastname', 'n:prop'), ('and', 'coord'), ('it', 'pro'), ('is', 'v:cop'), ('March', 'n:prop'), ('fourth', 'adj'), ('I', 'pro:sub'), ('believe', 'v'), ('and', 'coord'), ('when', 'adv:wh'), ('was', 'v:cop'), ("Parent's", 'n:prop'), ('birthday', 'n')], [("Child's", 'n:prop')], [('oh', 'co'), ("I'm", 'pro:sub'), ('sorry', 'adj')], [("that's", 'pro:dem'), ('okay', 'adj')], [('February', 'n:prop'), ('first', 'adj'), ('nineteen', 'det:num'), ('eighty', 'det:num'), ('four', 'det:num')], [('great', 'adj')], [('and', 'coord'), ("she's", 'pro:sub'), ('two', 'det:num'), ('years', 'n'), ('old', 'adj')], [('correct', 'adj')], [('okay', 'co')], [('she', 'pro:sub'), ('just', 'adv:int'), ('turned', 'part'), ('two', 'det:num'), ('a', 'det'), ('month', 'n'), ('ago', 'adv')]] When the argument *stem* is true, the word stems (e.g., 'is' -> 'be-3PS') are used instread of the original words. >>> valian.words('Valian/01a.xml')[:30] ['at', 'Parent', "Lastname's", 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'is', ... 
>>> valian.words('Valian/01a.xml',stem=True)[:30] ['at', 'Parent', 'Lastname', 's', 'house', 'with', 'Child', 'Lastname', 'and', 'it', 'be-3S', ... When the argument *replace* is true, the replaced words are used instread of the original words. >>> valian.words('Valian/01a.xml',speaker='CHI')[247] 'tikteat' >>> valian.words('Valian/01a.xml',speaker='CHI',replace=True)[247] 'trick' When the argument *relation* is true, the relational relationships in the sentence are returned. See Sagae et al. (2010) for details of the relational structure adopted in the CHILDES. >>> valian.words('Valian/01a.xml',relation=True)[:10] [[('at', 'prep', '1|0|ROOT'), ('Parent', 'n', '2|5|VOC'), ('Lastname', 'n', '3|5|MOD'), ('s', 'poss', '4|5|MOD'), ('house', 'n', '5|1|POBJ'), ('with', 'prep', '6|1|JCT'), ('Child', 'n', '7|8|NAME'), ('Lastname', 'n', '8|6|POBJ'), ('and', 'coord', '9|8|COORD'), ('it', 'pro', '10|11|SUBJ'), ('be-3S', 'v', '11|9|COMP'), ('March', 'n', '12|11|PRED'), ('fourth', 'adj', '13|12|MOD'), ('I', 'pro', '15|16|SUBJ'), ('believe', 'v', '16|14|ROOT'), ('and', 'coord', '18|17|ROOT'), ('when', 'adv', '19|20|PRED'), ('be-PAST', 'v', '20|18|COMP'), ('Parent', 'n', '21|23|MOD'), ('s', 'poss', '22|23|MOD'), ('birth', 'n', '23|20|SUBJ')], [('Child', 'n', '1|2|MOD'), ('s', 'poss', '2|0|ROOT')], [('oh', 'co', '1|4|COM'), ('I', 'pro', '3|4|SUBJ'), ('be', 'v', '4|0|ROOT'), ('sorry', 'adj', '5|4|PRED')], [('that', 'pro', '1|2|SUBJ'), ('be', 'v', '2|0|ROOT'), ('okay', 'adj', '3|2|PRED')], [('February', 'n', '1|6|VOC'), ('first', 'adj', '2|6|ENUM'), ('nineteen', 'det', '4|6|ENUM'), ('eighty', 'det', '5|6|ENUM'), ('four', 'det', '6|0|ROOT')], [('great', 'adj', '1|0|ROOT')], [('and', 'coord', '1|0|ROOT'), ('she', 'pro', '2|1|ROOT'), ('be', 'aux', '3|5|AUX'), ('two', 'det', '4|5|QUANT'), ('year-PL', 'n', '5|2|ROOT'), ('old', 'adj', '6|5|MOD')], [('correct', 'adj', '1|0|ROOT')], [('okay', 'co', '1|0|ROOT')], [('she', 'pro', '1|0|ROOT'), ('just', 'adv', '2|3|JCT'), ('turn-PERF', 'part', '3|1|XCOMP'), ('two', 'det', '4|6|QUANT'), ('a', 'det', '5|6|DET'), ('month', 'n', '6|3|OBJ'), ('ago', 'adv', '7|3|JCT')]] Printing age. When the argument *month* is true, the age information in the CHILDES format is converted into the number of months. >>> valian.age() ['P2Y1M3D', 'P2Y1M12D', 'P1Y9M21D', 'P1Y9M28D', 'P2Y1M23D', ... >>> valian.age('Valian/01a.xml') ['P2Y1M3D'] >>> valian.age('Valian/01a.xml',month=True) [25] Printing MLU. The criteria for the MLU computation is broadly based on Brown (1973). >>> valian.MLU() [2.3574660633484..., 2.292682926829..., 3.492857142857..., 2.961783439490..., 2.0842696629213..., 3.169811320754..., 3.137404580152..., 3.0578034682080..., 4.090163934426..., 3.488372093023..., 2.8773584905660..., 3.4792899408284..., 4.0111940298507..., 3.456790123456..., 4.487603305785..., 4.007936507936..., 5.25, 5.154696132596..., ...] >>> valian.MLU('Valian/01a.xml') [2.35746606334...] Basic stuff ============================== Count the number of words and sentences of each file. >>> valian = CHILDESCorpusReader(corpus_root, 'Valian/.*.xml') >>> for this_file in valian.fileids()[:6]: ... print(valian.corpus(this_file)[0]['Corpus'], valian.corpus(this_file)[0]['Id']) ... print("num of words: %i" % len(valian.words(this_file))) ... 
print("num of sents: %i" % len(valian.sents(this_file))) valian 01a num of words: 3606 num of sents: 1027 valian 01b num of words: 4376 num of sents: 1274 valian 02a num of words: 2673 num of sents: 801 valian 02b num of words: 5020 num of sents: 1583 valian 03a num of words: 2743 num of sents: 988 valian 03b num of words: 4409 num of sents: 1397 nltk-3.1/nltk/test/childes_fixt.py0000644000076500000240000000071312574600335017030 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import def setup_module(module): from nose import SkipTest import nltk.data try: nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/') except LookupError as e: print(e) raise SkipTest("The CHILDES corpus is not found. " "It should be manually downloaded and saved/unpacked " "to [NLTK_Data_Dir]/corpora/childes/") nltk-3.1/nltk/test/chunk.doctest0000644000076500000240000002564312607224144016516 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ========== Chunking ========== >>> from nltk.chunk import * >>> from nltk.chunk.util import * >>> from nltk.chunk.regexp import * >>> from nltk import Tree >>> tagged_text = "[ The/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] [ the/DT dog/NN ] chewed/VBD ./." >>> gold_chunked_text = tagstr2tree(tagged_text) >>> unchunked_text = gold_chunked_text.flatten() Chunking uses a special regexp syntax for rules that delimit the chunks. These rules must be converted to 'regular' regular expressions before a sentence can be chunked. >>> tag_pattern = "
    ?*" >>> regexp_pattern = tag_pattern2re_pattern(tag_pattern) >>> regexp_pattern '(<(DT)>)?(<(JJ)>)*(<(NN[^\\{\\}<>]*)>)' Construct some new chunking rules. >>> chunk_rule = ChunkRule("<.*>+", "Chunk everything") >>> chink_rule = ChinkRule("", "Chink on verbs/prepositions") >>> split_rule = SplitRule("
    ", "
    ", ... "Split successive determiner/noun pairs") Create and score a series of chunk parsers, successively more complex. >>> chunk_parser = RegexpChunkParser([chunk_rule], chunk_label='NP') >>> chunked_text = chunk_parser.parse(unchunked_text) >>> print(chunked_text) (S (NP The/DT cat/NN sat/VBD on/IN the/DT mat/NN the/DT dog/NN chewed/VBD ./.)) >>> chunkscore = ChunkScore() >>> chunkscore.score(gold_chunked_text, chunked_text) >>> print(chunkscore.precision()) 0.0 >>> print(chunkscore.recall()) 0.0 >>> print(chunkscore.f_measure()) 0 >>> for chunk in sorted(chunkscore.missed()): print(chunk) (NP The/DT cat/NN) (NP the/DT dog/NN) (NP the/DT mat/NN) >>> for chunk in chunkscore.incorrect(): print(chunk) (NP The/DT cat/NN sat/VBD on/IN the/DT mat/NN the/DT dog/NN chewed/VBD ./.) >>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule], ... chunk_label='NP') >>> chunked_text = chunk_parser.parse(unchunked_text) >>> print(chunked_text) (S (NP The/DT cat/NN) sat/VBD on/IN (NP the/DT mat/NN the/DT dog/NN) chewed/VBD ./.) >>> assert chunked_text == chunk_parser.parse(list(unchunked_text)) >>> chunkscore = ChunkScore() >>> chunkscore.score(gold_chunked_text, chunked_text) >>> chunkscore.precision() 0.5 >>> print(chunkscore.recall()) 0.33333333... >>> print(chunkscore.f_measure()) 0.4 >>> for chunk in sorted(chunkscore.missed()): print(chunk) (NP the/DT dog/NN) (NP the/DT mat/NN) >>> for chunk in chunkscore.incorrect(): print(chunk) (NP the/DT mat/NN the/DT dog/NN) >>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule, split_rule], ... chunk_label='NP') >>> chunked_text = chunk_parser.parse(unchunked_text, trace=True) # Input:
<DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.>
# Chunk everything: {<DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.>}
# Chink on verbs/prepositions: {<DT>  <NN>} <VBD> <IN> {<DT>  <NN>  <DT>  <NN>} <VBD> <.>
# Split successive determiner/noun pairs: {<DT>  <NN>} <VBD> <IN> {<DT>  <NN>}{<DT>  <NN>} <VBD> <.>
>>> print(chunked_text) (S (NP The/DT cat/NN) sat/VBD on/IN (NP the/DT mat/NN) (NP the/DT dog/NN) chewed/VBD ./.)
>>> chunkscore = ChunkScore() >>> chunkscore.score(gold_chunked_text, chunked_text) >>> chunkscore.precision() 1.0 >>> chunkscore.recall() 1.0 >>> chunkscore.f_measure() 1.0 >>> chunkscore.missed() [] >>> chunkscore.incorrect() []
>>> chunk_parser.rules() # doctest: +NORMALIZE_WHITESPACE [<ChunkRule: '<.*>+'>, <ChinkRule: '<VBD|IN|\\.>'>, <SplitRule: '<DT><NN>', '<DT><NN>'>]
Printing parsers:
>>> print(repr(chunk_parser)) <RegexpChunkParser with 3 rules>
>>> print(chunk_parser) RegexpChunkParser with 3 rules: Chunk everything <ChunkRule: '<.*>+'> Chink on verbs/prepositions <ChinkRule: '<VBD|IN|\\.>'> Split successive determiner/noun pairs <SplitRule: '<DT><NN>', '<DT><NN>
    '> Regression Tests ~~~~~~~~~~~~~~~~ ChunkParserI ------------ `ChunkParserI` is an abstract interface -- it is not meant to be instantiated directly. >>> ChunkParserI().parse([]) Traceback (most recent call last): . . . NotImplementedError ChunkString ----------- ChunkString can be built from a tree of tagged tuples, a tree of trees, or a mixed list of both: >>> t1 = Tree('S', [('w%d' % i, 't%d' % i) for i in range(10)]) >>> t2 = Tree('S', [Tree('t0', []), Tree('t1', ['c1'])]) >>> t3 = Tree('S', [('w0', 't0'), Tree('t1', ['c1'])]) >>> ChunkString(t1) '> >>> ChunkString(t2) '> >>> ChunkString(t3) '> Other values generate an error: >>> ChunkString(Tree('S', ['x'])) Traceback (most recent call last): . . . ValueError: chunk structures must contain tagged tokens or trees The `str()` for a chunk string adds spaces to it, which makes it line up with `str()` output for other chunk strings over the same underlying input. >>> cs = ChunkString(t1) >>> print(cs) >>> cs.xform('', '{}') >>> print(cs) {} The `_verify()` method makes sure that our transforms don't corrupt the chunk string. By setting debug_level=2, `_verify()` will be called at the end of every call to `xform`. >>> cs = ChunkString(t1, debug_level=3) >>> # tag not marked with <...>: >>> cs.xform('', 't3') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: t3 >>> # brackets not balanced: >>> cs.xform('', '{') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: { >>> # nested brackets: >>> cs.xform('', '{{}}') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: {{}} >>> # modified tags: >>> cs.xform('', '') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: tag changed >>> # added tags: >>> cs.xform('', '') Traceback (most recent call last): . . . ValueError: Transformation generated invalid chunkstring: tag changed Chunking Rules -------------- Test the different rule constructors & __repr__ methods: >>> r1 = RegexpChunkRule(''+ChunkString.IN_CHINK_PATTERN, ... '{}', 'chunk and ') >>> r2 = RegexpChunkRule(re.compile(''+ChunkString.IN_CHINK_PATTERN), ... '{}', 'chunk and ') >>> r3 = ChunkRule('', 'chunk and ') >>> r4 = ChinkRule('', 'chink and ') >>> r5 = UnChunkRule('', 'unchunk and ') >>> r6 = MergeRule('', '', 'merge w/ ') >>> r7 = SplitRule('', '', 'split from ') >>> r8 = ExpandLeftRule('', '', 'expand left ') >>> r9 = ExpandRightRule('', '', 'expand right ') >>> for rule in r1, r2, r3, r4, r5, r6, r7, r8, r9: ... print(rule) (?=[^\\}]*(\\{|$))'->'{}'> (?=[^\\}]*(\\{|$))'->'{}'> '> '> '> ', ''> ', ''> ', ''> ', ''> `tag_pattern2re_pattern()` complains if the tag pattern looks problematic: >>> tag_pattern2re_pattern('{}') Traceback (most recent call last): . . . ValueError: Bad tag pattern: '{}' RegexpChunkParser ----------------- A warning is printed when parsing an empty sentence: >>> parser = RegexpChunkParser([ChunkRule('', '')]) >>> parser.parse(Tree('S', [])) Warning: parsing empty text Tree('S', []) RegexpParser ------------ >>> parser = RegexpParser(''' ... NP: {
<DT>? <JJ>* <NN>*}   # NP
    ... P: {<IN>}            # Preposition
    ... V: {<V.*>}           # Verb
    ... PP: {<P> <NP>}       # PP -> P NP
    ... VP: {<V> <NP|PP>*}   # VP -> V (NP|PP)*
    ... ''')
    >>> print(repr(parser))
    <chunk.RegexpParser with 5 stages>
    >>> print(parser)
    chunk.RegexpParser with 5 stages:
    RegexpChunkParser with 1 rules:
        NP   <ChunkRule: '<DT>? <JJ>* <NN>*'>
    RegexpChunkParser with 1 rules:
        Preposition   <ChunkRule: '<IN>'>
    RegexpChunkParser with 1 rules:
        Verb   <ChunkRule: '<V.*>'>
    RegexpChunkParser with 1 rules:
        PP -> P NP   <ChunkRule: '<P> <NP>'>
    RegexpChunkParser with 1 rules:
        VP -> V (NP|PP)*   <ChunkRule: '<V> <NP|PP>*'>
    >>> print(parser.parse(unchunked_text, trace=True))
    # Input:
     <DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.>
    # NP:
    {<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>}{<DT>  <NN>} <VBD>  <.>
    # Input:
     <NP>  <VBD>  <IN>  <NP>  <NP>  <VBD>  <.>
    # Preposition:
     <NP>  <VBD> {<IN>} <NP>  <NP>  <VBD>  <.>
    # Input:
     <NP>  <VBD>  <P>  <NP>  <NP>  <VBD>  <.>
    # Verb:
     <NP> {<VBD>} <P>  <NP>  <NP> {<VBD>} <.>
    # Input:
     <NP>  <V>  <P>  <NP>  <NP>  <V>  <.>
    # PP -> P NP:
     <NP>  <V> {<P>  <NP>
    } <.> # Input: <.> # VP -> V (NP|PP)*: { }{} <.> (S (NP The/DT cat/NN) (VP (V sat/VBD) (PP (P on/IN) (NP the/DT mat/NN)) (NP the/DT dog/NN)) (VP (V chewed/VBD)) ./.) Test parsing of other rule types: >>> print(RegexpParser(''' ... X: ... }{ # chink rule ... }{ # split rule ... {} # merge rule ... {} # chunk rule w/ context ... ''')) chunk.RegexpParser with 1 stages: RegexpChunkParser with 4 rules: chink rule '> split rule ', ''> merge rule ', ''> chunk rule w/ context ', '', ''> Illegal patterns give an error message: >>> print(RegexpParser('X: {} {}')) Traceback (most recent call last): . . . ValueError: Illegal chunk pattern: {} {} nltk-3.1/nltk/test/classify.doctest0000644000076500000240000001536212607224144017220 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ============= Classifiers ============= Classifiers label tokens with category labels (or *class labels*). Typically, labels are represented with strings (such as ``"health"`` or ``"sports"``. In NLTK, classifiers are defined using classes that implement the `ClassifyI` interface: >>> import nltk >>> nltk.usage(nltk.classify.ClassifierI) ClassifierI supports the following operations: - self.classify(featureset) - self.classify_many(featuresets) - self.labels() - self.prob_classify(featureset) - self.prob_classify_many(featuresets) NLTK defines several classifier classes: - `ConditionalExponentialClassifier` - `DecisionTreeClassifier` - `MaxentClassifier` - `NaiveBayesClassifier` - `WekaClassifier` Classifiers are typically created by training them on a training corpus. Regression Tests ~~~~~~~~~~~~~~~~ We define a very simple training corpus with 3 binary features: ['a', 'b', 'c'], and are two labels: ['x', 'y']. We use a simple feature set so that the correct answers can be calculated analytically (although we haven't done this yet for all tests). >>> train = [ ... (dict(a=1,b=1,c=1), 'y'), ... (dict(a=1,b=1,c=1), 'x'), ... (dict(a=1,b=1,c=0), 'y'), ... (dict(a=0,b=1,c=1), 'x'), ... (dict(a=0,b=1,c=1), 'y'), ... (dict(a=0,b=0,c=1), 'y'), ... (dict(a=0,b=1,c=0), 'x'), ... (dict(a=0,b=0,c=0), 'x'), ... (dict(a=0,b=1,c=1), 'y'), ... ] >>> test = [ ... (dict(a=1,b=0,c=1)), # unseen ... (dict(a=1,b=0,c=0)), # unseen ... (dict(a=0,b=1,c=1)), # seen 3 times, labels=y,y,x ... (dict(a=0,b=1,c=0)), # seen 1 time, label=x ... ] Test the Naive Bayes classifier: >>> classifier = nltk.classify.NaiveBayesClassifier.train(train) >>> sorted(classifier.labels()) ['x', 'y'] >>> classifier.classify_many(test) ['y', 'x', 'y', 'x'] >>> for pdist in classifier.prob_classify_many(test): ... print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y'))) 0.3203 0.6797 0.5857 0.4143 0.3792 0.6208 0.6470 0.3530 >>> classifier.show_most_informative_features() Most Informative Features c = 0 x : y = 2.0 : 1.0 c = 1 y : x = 1.5 : 1.0 a = 1 y : x = 1.4 : 1.0 b = 0 x : y = 1.2 : 1.0 a = 0 x : y = 1.2 : 1.0 b = 1 y : x = 1.1 : 1.0 Test the Decision Tree classifier: >>> classifier = nltk.classify.DecisionTreeClassifier.train( ... train, entropy_cutoff=0, ... support_cutoff=0) >>> sorted(classifier.labels()) ['x', 'y'] >>> print(classifier) c=0? .................................................. x a=0? ................................................ x a=1? ................................................ y c=1? .................................................. y >>> classifier.classify_many(test) ['y', 'y', 'y', 'x'] >>> for pdist in classifier.prob_classify_many(test): ... 
print('%.4f %.4f' % (pdist.prob('x'), pdist.prob('y'))) Traceback (most recent call last): . . . NotImplementedError Test SklearnClassifier, which requires the scikit-learn package. >>> from nltk.classify import SklearnClassifier >>> from sklearn.naive_bayes import BernoulliNB >>> from sklearn.svm import SVC >>> train_data = [({"a": 4, "b": 1, "c": 0}, "ham"), ... ({"a": 5, "b": 2, "c": 1}, "ham"), ... ({"a": 0, "b": 3, "c": 4}, "spam"), ... ({"a": 5, "b": 1, "c": 1}, "ham"), ... ({"a": 1, "b": 4, "c": 3}, "spam")] >>> classif = SklearnClassifier(BernoulliNB()).train(train_data) >>> test_data = [{"a": 3, "b": 2, "c": 1}, ... {"a": 0, "b": 3, "c": 7}] >>> classif.classify_many(test_data) ['ham', 'spam'] >>> classif = SklearnClassifier(SVC(), sparse=False).train(train_data) >>> classif.classify_many(test_data) ['ham', 'spam'] Test the Maximum Entropy classifier training algorithms; they should all generate the same results. >>> def print_maxent_test_header(): ... print(' '*11+''.join([' test[%s] ' % i ... for i in range(len(test))])) ... print(' '*11+' p(x) p(y)'*len(test)) ... print('-'*(11+15*len(test))) >>> def test_maxent(algorithm): ... print('%11s' % algorithm, end=' ') ... try: ... classifier = nltk.classify.MaxentClassifier.train( ... train, algorithm, trace=0, max_iter=1000) ... except Exception as e: ... print('Error: %r' % e) ... return ... ... for featureset in test: ... pdist = classifier.prob_classify(featureset) ... print('%8.2f%6.2f' % (pdist.prob('x'), pdist.prob('y')), end=' ') ... print() >>> print_maxent_test_header(); test_maxent('GIS'); test_maxent('IIS') test[0] test[1] test[2] test[3] p(x) p(y) p(x) p(y) p(x) p(y) p(x) p(y) ----------------------------------------------------------------------- GIS 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 IIS 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 >>> test_maxent('MEGAM'); test_maxent('TADM') # doctest: +SKIP MEGAM 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 TADM 0.16 0.84 0.46 0.54 0.41 0.59 0.76 0.24 Regression tests for TypedMaxentFeatureEncoding ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ >>> from nltk.classify import maxent >>> train = [ ... ({'a': 1, 'b': 1, 'c': 1}, 'y'), ... ({'a': 5, 'b': 5, 'c': 5}, 'x'), ... ({'a': 0.9, 'b': 0.9, 'c': 0.9}, 'y'), ... ({'a': 5.5, 'b': 5.4, 'c': 5.3}, 'x'), ... ({'a': 0.8, 'b': 1.2, 'c': 1}, 'y'), ... ({'a': 5.1, 'b': 4.9, 'c': 5.2}, 'x') ... ] >>> test = [ ... {'a': 1, 'b': 0.8, 'c': 1.2}, ... {'a': 5.2, 'b': 5.1, 'c': 5} ... ] >>> encoding = maxent.TypedMaxentFeatureEncoding.train( ... train, count_cutoff=3, alwayson_features=True) >>> classifier = maxent.MaxentClassifier.train( ... train, bernoulli=False, encoding=encoding, trace=0) >>> classifier.classify_many(test) ['y', 'x'] nltk-3.1/nltk/test/classify_fixt.py0000644000076500000240000000042112574600335017226 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import # most of classify.doctest requires numpy def setup_module(module): from nose import SkipTest try: import numpy except ImportError: raise SkipTest("classify.doctest requires numpy")nltk-3.1/nltk/test/collocations.doctest0000644000076500000240000002572512607224144020100 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ============== Collocations ============== Overview ~~~~~~~~ Collocations are expressions of multiple words which commonly co-occur. For example, the top ten bigram collocations in Genesis are listed below, as measured using Pointwise Mutual Information. 
>>> import nltk >>> from nltk.collocations import * >>> bigram_measures = nltk.collocations.BigramAssocMeasures() >>> trigram_measures = nltk.collocations.TrigramAssocMeasures() >>> finder = BigramCollocationFinder.from_words( ... nltk.corpus.genesis.words('english-web.txt')) >>> finder.nbest(bigram_measures.pmi, 10) # doctest: +NORMALIZE_WHITESPACE [(u'Allon', u'Bacuth'), (u'Ashteroth', u'Karnaim'), (u'Ben', u'Ammi'), (u'En', u'Mishpat'), (u'Jegar', u'Sahadutha'), (u'Salt', u'Sea'), (u'Whoever', u'sheds'), (u'appoint', u'overseers'), (u'aromatic', u'resin'), (u'cutting', u'instrument')] While these words are highly collocated, the expressions are also very infrequent. Therefore it is useful to apply filters, such as ignoring all bigrams which occur less than three times in the corpus: >>> finder.apply_freq_filter(3) >>> finder.nbest(bigram_measures.pmi, 10) # doctest: +NORMALIZE_WHITESPACE [(u'Beer', u'Lahai'), (u'Lahai', u'Roi'), (u'gray', u'hairs'), (u'Most', u'High'), (u'ewe', u'lambs'), (u'many', u'colors'), (u'burnt', u'offering'), (u'Paddan', u'Aram'), (u'east', u'wind'), (u'living', u'creature')] We may similarly find collocations among tagged words: >>> finder = BigramCollocationFinder.from_words( ... nltk.corpus.brown.tagged_words('ca01', tagset='universal')) >>> finder.nbest(bigram_measures.pmi, 5) # doctest: +NORMALIZE_WHITESPACE [(('1,119', 'NUM'), ('votes', 'NOUN')), (('1962', 'NUM'), ("governor's", 'NOUN')), (('637', 'NUM'), ('E.', 'NOUN')), (('Alpharetta', 'NOUN'), ('prison', 'NOUN')), (('Bar', 'NOUN'), ('Association', 'NOUN'))] Or tags alone: >>> finder = BigramCollocationFinder.from_words(t for w, t in ... nltk.corpus.brown.tagged_words('ca01', tagset='universal')) >>> finder.nbest(bigram_measures.pmi, 10) # doctest: +NORMALIZE_WHITESPACE [('PRT', 'VERB'), ('PRON', 'VERB'), ('ADP', 'DET'), ('.', 'PRON'), ('DET', 'ADJ'), ('CONJ', 'PRON'), ('ADP', 'NUM'), ('NUM', '.'), ('ADV', 'ADV'), ('VERB', 'ADV')] Or spanning intervening words: >>> finder = BigramCollocationFinder.from_words( ... nltk.corpus.genesis.words('english-web.txt'), ... window_size = 20) >>> finder.apply_freq_filter(2) >>> ignored_words = nltk.corpus.stopwords.words('english') >>> finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words) >>> finder.nbest(bigram_measures.likelihood_ratio, 10) # doctest: +NORMALIZE_WHITESPACE [(u'chief', u'chief'), (u'became', u'father'), (u'years', u'became'), (u'hundred', u'years'), (u'lived', u'became'), (u'king', u'king'), (u'lived', u'years'), (u'became', u'became'), (u'chief', u'chiefs'), (u'hundred', u'became')] Finders ~~~~~~~ The collocations package provides collocation finders which by default consider all ngrams in a text as candidate collocations: >>> text = "I do not like green eggs and ham, I do not like them Sam I am!" 
>>> tokens = nltk.wordpunct_tokenize(text) >>> finder = BigramCollocationFinder.from_words(tokens) >>> scored = finder.score_ngrams(bigram_measures.raw_freq) >>> sorted(bigram for bigram, score in scored) # doctest: +NORMALIZE_WHITESPACE [(',', 'I'), ('I', 'am'), ('I', 'do'), ('Sam', 'I'), ('am', '!'), ('and', 'ham'), ('do', 'not'), ('eggs', 'and'), ('green', 'eggs'), ('ham', ','), ('like', 'green'), ('like', 'them'), ('not', 'like'), ('them', 'Sam')] We could otherwise construct the collocation finder from manually-derived FreqDists: >>> word_fd = nltk.FreqDist(tokens) >>> bigram_fd = nltk.FreqDist(nltk.bigrams(tokens)) >>> finder = BigramCollocationFinder(word_fd, bigram_fd) >>> scored == finder.score_ngrams(bigram_measures.raw_freq) True A similar interface is provided for trigrams: >>> finder = TrigramCollocationFinder.from_words(tokens) >>> scored = finder.score_ngrams(trigram_measures.raw_freq) >>> set(trigram for trigram, score in scored) == set(nltk.trigrams(tokens)) True We may want to select only the top n results: >>> sorted(finder.nbest(trigram_measures.raw_freq, 2)) [('I', 'do', 'not'), ('do', 'not', 'like')] Alternatively, we can select those above a minimum score value: >>> sorted(finder.above_score(trigram_measures.raw_freq, ... 1.0 / len(tuple(nltk.trigrams(tokens))))) [('I', 'do', 'not'), ('do', 'not', 'like')] Now spanning intervening words: >>> finder = TrigramCollocationFinder.from_words(tokens) >>> finder = TrigramCollocationFinder.from_words(tokens, window_size=4) >>> sorted(finder.nbest(trigram_measures.raw_freq, 4)) [('I', 'do', 'like'), ('I', 'do', 'not'), ('I', 'not', 'like'), ('do', 'not', 'like')] A closer look at the finder's ngram frequencies: >>> sorted(finder.ngram_fd.items(), key=lambda t: (-t[1], t[0]))[:10] # doctest: +NORMALIZE_WHITESPACE [(('I', 'do', 'like'), 2), (('I', 'do', 'not'), 2), (('I', 'not', 'like'), 2), (('do', 'not', 'like'), 2), ((',', 'I', 'do'), 1), ((',', 'I', 'not'), 1), ((',', 'do', 'not'), 1), (('I', 'am', '!'), 1), (('Sam', 'I', '!'), 1), (('Sam', 'I', 'am'), 1)] Filtering candidates ~~~~~~~~~~~~~~~~~~~~ All the ngrams in a text are often too many to be useful when finding collocations. It is generally useful to remove some words or punctuation, and to require a minimum frequency for candidate collocations. Given our sample text above, if we remove all trigrams containing personal pronouns from candidature, score_ngrams should return 6 less results, and 'do not like' will be the only candidate which occurs more than once: >>> finder = TrigramCollocationFinder.from_words(tokens) >>> len(finder.score_ngrams(trigram_measures.raw_freq)) 14 >>> finder.apply_word_filter(lambda w: w in ('I', 'me')) >>> len(finder.score_ngrams(trigram_measures.raw_freq)) 8 >>> sorted(finder.above_score(trigram_measures.raw_freq, ... 1.0 / len(tuple(nltk.trigrams(tokens))))) [('do', 'not', 'like')] Sometimes a filter is a function on the whole ngram, rather than each word, such as if we may permit 'and' to appear in the middle of a trigram, but not on either edge: >>> finder.apply_ngram_filter(lambda w1, w2, w3: 'and' in (w1, w3)) >>> len(finder.score_ngrams(trigram_measures.raw_freq)) 6 Finally, it is often important to remove low frequency candidates, as we lack sufficient evidence about their significance as collocations: >>> finder.apply_freq_filter(2) >>> len(finder.score_ngrams(trigram_measures.raw_freq)) 1 Association measures ~~~~~~~~~~~~~~~~~~~~ A number of measures are available to score collocations or other associations. 
The arguments to measure functions are marginals of a contingency table, in the bigram case (n_ii, (n_ix, n_xi), n_xx):: w1 ~w1 ------ ------ w2 | n_ii | n_oi | = n_xi ------ ------ ~w2 | n_io | n_oo | ------ ------ = n_ix TOTAL = n_xx We test their calculation using some known values presented in Manning and Schutze's text and other papers. Student's t: examples from Manning and Schutze 5.3.2 >>> print('%0.4f' % bigram_measures.student_t(8, (15828, 4675), 14307668)) 0.9999 >>> print('%0.4f' % bigram_measures.student_t(20, (42, 20), 14307668)) 4.4721 Chi-square: examples from Manning and Schutze 5.3.3 >>> print('%0.2f' % bigram_measures.chi_sq(8, (15828, 4675), 14307668)) 1.55 >>> print('%0.0f' % bigram_measures.chi_sq(59, (67, 65), 571007)) 456400 Likelihood ratios: examples from Dunning, CL, 1993 >>> print('%0.2f' % bigram_measures.likelihood_ratio(110, (2552, 221), 31777)) 270.72 >>> print('%0.2f' % bigram_measures.likelihood_ratio(8, (13, 32), 31777)) 95.29 Pointwise Mutual Information: examples from Manning and Schutze 5.4 >>> print('%0.2f' % bigram_measures.pmi(20, (42, 20), 14307668)) 18.38 >>> print('%0.2f' % bigram_measures.pmi(20, (15019, 15629), 14307668)) 0.29 TODO: Find authoritative results for trigrams. Using contingency table values ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ While frequency counts make marginals readily available for collocation finding, it is common to find published contingency table values. The collocations package therefore provides a wrapper, ContingencyMeasures, which wraps an association measures class, providing association measures which take contingency values as arguments, (n_ii, n_io, n_oi, n_oo) in the bigram case. >>> from nltk.metrics import ContingencyMeasures >>> cont_bigram_measures = ContingencyMeasures(bigram_measures) >>> print('%0.2f' % cont_bigram_measures.likelihood_ratio(8, 5, 24, 31740)) 95.29 >>> print('%0.2f' % cont_bigram_measures.chi_sq(8, 15820, 4667, 14287173)) 1.55 Ranking and correlation ~~~~~~~~~~~~~~~~~~~~~~~ It is useful to consider the results of finding collocations as a ranking, and the rankings output using different association measures can be compared using the Spearman correlation coefficient. Ranks can be assigned to a sorted list of results trivially by assigning strictly increasing ranks to each result: >>> from nltk.metrics.spearman import * >>> results_list = ['item1', 'item2', 'item3', 'item4', 'item5'] >>> print(list(ranks_from_sequence(results_list))) [('item1', 0), ('item2', 1), ('item3', 2), ('item4', 3), ('item5', 4)] If scores are available for each result, we may allow sufficiently similar results (differing by no more than rank_gap) to be assigned the same rank: >>> results_scored = [('item1', 50.0), ('item2', 40.0), ('item3', 38.0), ... ('item4', 35.0), ('item5', 14.0)] >>> print(list(ranks_from_scores(results_scored, rank_gap=5))) [('item1', 0), ('item2', 1), ('item3', 1), ('item4', 1), ('item5', 4)] The Spearman correlation coefficient gives a number from -1.0 to 1.0 comparing two rankings. A coefficient of 1.0 indicates identical rankings; -1.0 indicates exact opposite rankings. >>> print('%0.1f' % spearman_correlation( ... ranks_from_sequence(results_list), ... ranks_from_sequence(results_list))) 1.0 >>> print('%0.1f' % spearman_correlation( ... ranks_from_sequence(reversed(results_list)), ... ranks_from_sequence(results_list))) -1.0 >>> results_list2 = ['item2', 'item3', 'item1', 'item5', 'item4'] >>> print('%0.1f' % spearman_correlation( ... ranks_from_sequence(results_list), ... 
ranks_from_sequence(results_list2))) 0.6 >>> print('%0.1f' % spearman_correlation( ... ranks_from_sequence(reversed(results_list)), ... ranks_from_sequence(results_list2))) -0.6 nltk-3.1/nltk/test/compat.doctest0000644000076500000240000000740612574600335016671 0ustar sbstaff00000000000000 ========================================= NLTK Python 2.x - 3.x Compatibility Layer ========================================= NLTK comes with a Python 2.x/3.x compatibility layer, nltk.compat (which is loosely based on `six `_):: >>> from nltk import compat >>> compat.PY3 False >>> compat.integer_types (, ) >>> compat.string_types (,) >>> # and so on @python_2_unicode_compatible ---------------------------- Under Python 2.x ``__str__`` and ``__repr__`` methods must return bytestrings. ``@python_2_unicode_compatible`` decorator allows writing these methods in a way compatible with Python 3.x: 1) wrap a class with this decorator, 2) define ``__str__`` and ``__repr__`` methods returning unicode text (that's what they must return under Python 3.x), and they would be fixed under Python 2.x to return byte strings:: >>> from nltk.compat import python_2_unicode_compatible >>> @python_2_unicode_compatible ... class Foo(object): ... def __str__(self): ... return u'__str__ is called' ... def __repr__(self): ... return u'__repr__ is called' >>> foo = Foo() >>> foo.__str__().__class__ >>> foo.__repr__().__class__ >>> print(foo) __str__ is called >>> foo __repr__ is called Original versions of ``__str__`` and ``__repr__`` are available as ``__unicode__`` and ``unicode_repr``:: >>> foo.__unicode__().__class__ >>> foo.unicode_repr().__class__ >>> unicode(foo) u'__str__ is called' >>> foo.unicode_repr() u'__repr__ is called' There is no need to wrap a subclass with ``@python_2_unicode_compatible`` if it doesn't override ``__str__`` and ``__repr__``:: >>> class Bar(Foo): ... pass >>> bar = Bar() >>> bar.__str__().__class__ However, if a subclass overrides ``__str__`` or ``__repr__``, wrap it again:: >>> class BadBaz(Foo): ... def __str__(self): ... return u'Baz.__str__' >>> baz = BadBaz() >>> baz.__str__().__class__ # this is incorrect! >>> @python_2_unicode_compatible ... class GoodBaz(Foo): ... def __str__(self): ... return u'Baz.__str__' >>> baz = GoodBaz() >>> baz.__str__().__class__ >>> baz.__unicode__().__class__ Applying ``@python_2_unicode_compatible`` to a subclass shouldn't break methods that was not overridden:: >>> baz.__repr__().__class__ >>> baz.unicode_repr().__class__ unicode_repr ------------ Under Python 3.x ``repr(unicode_string)`` doesn't have a leading "u" letter. ``nltk.compat.unicode_repr`` function may be used instead of ``repr`` and ``"%r" % obj`` to make the output more consistent under Python 2.x and 3.x:: >>> from nltk.compat import unicode_repr >>> print(repr(u"test")) u'test' >>> print(unicode_repr(u"test")) 'test' It may be also used to get an original unescaped repr (as unicode) of objects which class was fixed by ``@python_2_unicode_compatible`` decorator:: >>> @python_2_unicode_compatible ... class Foo(object): ... def __repr__(self): ... return u'' >>> foo = Foo() >>> repr(foo) '' >>> unicode_repr(foo) u'' For other objects it returns the same value as ``repr``:: >>> unicode_repr(5) '5' It may be a good idea to use ``unicode_repr`` instead of ``%r`` string formatting specifier inside ``__repr__`` or ``__str__`` methods of classes fixed by ``@python_2_unicode_compatible`` to make the output consistent between Python 2.x and 3.x. 
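As a rough illustration of that advice (a sketch only -- the ``Pair`` class
below is hypothetical and not part of NLTK), a ``__repr__`` built with
``unicode_repr`` renders its string fields without the Python 2.x ``u``
prefix::

    >>> from nltk.compat import python_2_unicode_compatible, unicode_repr
    >>> @python_2_unicode_compatible
    ... class Pair(object):
    ...     def __init__(self, first, second):
    ...         self.first, self.second = first, second
    ...     def __str__(self):
    ...         return u'%s/%s' % (self.first, self.second)
    ...     def __repr__(self):
    ...         # unicode_repr keeps u'...' prefixes out of the output under Python 2.x
    ...         return u'Pair(%s, %s)' % (unicode_repr(self.first),
    ...                                   unicode_repr(self.second))
    >>> Pair(u'left', u'right')
    Pair('left', 'right')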
nltk-3.1/nltk/test/compat_fixt.py0000644000076500000240000000033312574600335016676 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import from nltk.compat import PY3 def setup_module(module): from nose import SkipTest if PY3: raise SkipTest("compat.doctest is for Python 2.x") nltk-3.1/nltk/test/corpus.doctest0000644000076500000240000026062312607224144016720 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ================ Corpus Readers ================ The `nltk.corpus` package defines a collection of *corpus reader* classes, which can be used to access the contents of a diverse set of corpora. The list of available corpora is given at: http://www.nltk.org/nltk_data/ Each corpus reader class is specialized to handle a specific corpus format. In addition, the `nltk.corpus` package automatically creates a set of corpus reader instances that can be used to access the corpora in the NLTK data package. Section `Corpus Reader Objects`_ ("Corpus Reader Objects") describes the corpus reader instances that can be used to read the corpora in the NLTK data package. Section `Corpus Reader Classes`_ ("Corpus Reader Classes") describes the corpus reader classes themselves, and discusses the issues involved in creating new corpus reader objects and new corpus reader classes. Section `Regression Tests`_ ("Regression Tests") contains regression tests for the corpus readers and associated functions and classes. .. contents:: **Table of Contents** :depth: 2 :backlinks: none --------------------- Corpus Reader Objects --------------------- Overview ======== NLTK includes a diverse set of corpora which can be read using the ``nltk.corpus`` package. Each corpus is accessed by means of a "corpus reader" object from ``nltk.corpus``: >>> import nltk.corpus >>> # The Brown corpus: >>> print(str(nltk.corpus.brown).replace('\\\\','/')) >>> # The Penn Treebank Corpus: >>> print(str(nltk.corpus.treebank).replace('\\\\','/')) >>> # The Name Genders Corpus: >>> print(str(nltk.corpus.names).replace('\\\\','/')) >>> # The Inaugural Address Corpus: >>> print(str(nltk.corpus.inaugural).replace('\\\\','/')) Most corpora consist of a set of files, each containing a document (or other pieces of text). A list of identifiers for these files is accessed via the ``fileids()`` method of the corpus reader: >>> nltk.corpus.treebank.fileids() # doctest: +ELLIPSIS ['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', ...] >>> nltk.corpus.inaugural.fileids() # doctest: +ELLIPSIS ['1789-Washington.txt', '1793-Washington.txt', '1797-Adams.txt', ...] Each corpus reader provides a variety of methods to read data from the corpus, depending on the format of the corpus. For example, plaintext corpora support methods to read the corpus as raw text, a list of words, a list of sentences, or a list of paragraphs. >>> from nltk.corpus import inaugural >>> inaugural.raw('1789-Washington.txt') # doctest: +ELLIPSIS 'Fellow-Citizens of the Senate ...' >>> inaugural.words('1789-Washington.txt') ['Fellow', '-', 'Citizens', 'of', 'the', ...] >>> inaugural.sents('1789-Washington.txt') # doctest: +ELLIPSIS [['Fellow', '-', 'Citizens'...], ['Among', 'the', 'vicissitudes'...]...] >>> inaugural.paras('1789-Washington.txt') # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [[['Fellow', '-', 'Citizens'...]], [['Among', 'the', 'vicissitudes'...], ['On', 'the', 'one', 'hand', ',', 'I'...]...]...] 
Each of these reader methods may be given a single document's item name or a list of document item names. When given a list of document item names, the reader methods will concatenate together the contents of the individual documents. >>> l1 = len(inaugural.words('1789-Washington.txt')) >>> l2 = len(inaugural.words('1793-Washington.txt')) >>> l3 = len(inaugural.words(['1789-Washington.txt', '1793-Washington.txt'])) >>> print('%s+%s == %s' % (l1, l2, l3)) 1538+147 == 1685 If the reader methods are called without any arguments, they will typically load all documents in the corpus. >>> len(inaugural.words()) 145735 If a corpus contains a README file, it can be accessed with a ``readme()`` method: >>> inaugural.readme()[:32] 'C-Span Inaugural Address Corpus\n' Plaintext Corpora ================= Here are the first few words from each of NLTK's plaintext corpora: >>> nltk.corpus.abc.words() ['PM', 'denies', 'knowledge', 'of', 'AWB', ...] >>> nltk.corpus.genesis.words() [u'In', u'the', u'beginning', u'God', u'created', ...] >>> nltk.corpus.gutenberg.words(fileids='austen-emma.txt') ['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ...] >>> nltk.corpus.inaugural.words() ['Fellow', '-', 'Citizens', 'of', 'the', ...] >>> nltk.corpus.state_union.words() ['PRESIDENT', 'HARRY', 'S', '.', 'TRUMAN', "'", ...] >>> nltk.corpus.webtext.words() ['Cookie', 'Manager', ':', '"', 'Don', "'", 't', ...] Tagged Corpora ============== In addition to the plaintext corpora, NLTK's data package also contains a wide variety of annotated corpora. For example, the Brown Corpus is annotated with part-of-speech tags, and defines additional methods ``tagged_*()`` which words as `(word,tag)` tuples, rather than just bare word strings. >>> from nltk.corpus import brown >>> print(brown.words()) ['The', 'Fulton', 'County', 'Grand', 'Jury', ...] >>> print(brown.tagged_words()) [('The', 'AT'), ('Fulton', 'NP-TL'), ...] >>> print(brown.sents()) # doctest: +ELLIPSIS [['The', 'Fulton', 'County'...], ['The', 'jury', 'further'...], ...] >>> print(brown.tagged_sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [[('The', 'AT'), ('Fulton', 'NP-TL')...], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR')...]...] >>> print(brown.paras(categories='reviews')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [[['It', 'is', 'not', 'news', 'that', 'Nathan', 'Milstein'...], ['Certainly', 'not', 'in', 'Orchestra', 'Hall', 'where'...]], [['There', 'was', 'about', 'that', 'song', 'something', ...], ['Not', 'the', 'noblest', 'performance', 'we', 'have', ...], ...], ...] >>> print(brown.tagged_paras(categories='reviews')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [[[('It', 'PPS'), ('is', 'BEZ'), ('not', '*'), ...], [('Certainly', 'RB'), ('not', '*'), ('in', 'IN'), ...]], [[('There', 'EX'), ('was', 'BEDZ'), ('about', 'IN'), ...], [('Not', '*'), ('the', 'AT'), ('noblest', 'JJT'), ...], ...], ...] Similarly, the Indian Language POS-Tagged Corpus includes samples of Indian text annotated with part-of-speech tags: >>> from nltk.corpus import indian >>> print(indian.words()) # doctest: +SKIP ['\xe0\xa6\xae\xe0\xa6\xb9\xe0\xa6\xbf\...', '\xe0\xa6\xb8\xe0\xa6\xa8\xe0\xa7\x8d\xe0...', ...] >>> print(indian.tagged_words()) # doctest: +SKIP [('\xe0\xa6\xae\xe0\xa6\xb9\xe0\xa6\xbf...', 'NN'), ('\xe0\xa6\xb8\xe0\xa6\xa8\xe0\xa7\x8d\xe0...', 'NN'), ...] Several tagged corpora support access to a simplified, universal tagset, e.g. 
where all nouns tags are collapsed to a single category ``NOUN``: >>> print(brown.tagged_sents(tagset='universal')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [[('The', 'DET'), ('Fulton', 'NOUN'), ('County', 'NOUN'), ('Grand', 'ADJ'), ('Jury', 'NOUN'), ...], [('The', 'DET'), ('jury', 'NOUN'), ('further', 'ADV'), ('said', 'VERB'), ('in', 'ADP'), ...]...] >>> from nltk.corpus import conll2000, switchboard >>> print(conll2000.tagged_words(tagset='universal')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [('Confidence', 'NOUN'), ('in', 'ADP'), ...] Use ``nltk.app.pos_concordance()`` to access a GUI for searching tagged corpora. Chunked Corpora =============== The CoNLL corpora also provide chunk structures, which are encoded as flat trees. The CoNLL 2000 Corpus includes phrasal chunks; and the CoNLL 2002 Corpus includes named entity chunks. >>> from nltk.corpus import conll2000, conll2002 >>> print(conll2000.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [['Confidence', 'in', 'the', 'pound', 'is', 'widely', ...], ['Chancellor', 'of', 'the', 'Exchequer', ...], ...] >>> for tree in conll2000.chunked_sents()[:2]: ... print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE (S (NP Confidence/NN) (PP in/IN) (NP the/DT pound/NN) (VP is/VBZ widely/RB expected/VBN to/TO take/VB) (NP another/DT sharp/JJ dive/NN) if/IN ...) (S Chancellor/NNP (PP of/IN) (NP the/DT Exchequer/NNP) ...) >>> print(conll2002.sents()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [[u'Sao', u'Paulo', u'(', u'Brasil', u')', u',', ...], [u'-'], ...] >>> for tree in conll2002.chunked_sents()[:2]: ... print(tree) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE (S (LOC Sao/NC Paulo/VMI) (/Fpa (LOC Brasil/NC) )/Fpt ...) (S -/Fg) .. note:: Since the CONLL corpora do not contain paragraph break information, these readers do not support the ``para()`` method.) .. warning:: if you call the conll corpora reader methods without any arguments, they will return the contents of the entire corpus, *including* the 'test' portions of the corpus.) SemCor is a subset of the Brown corpus tagged with WordNet senses and named entities. Both kinds of lexical items include multiword units, which are encoded as chunks (senses and part-of-speech tags pertain to the entire chunk). >>> from nltk.corpus import semcor >>> semcor.words() ['The', 'Fulton', 'County', 'Grand', 'Jury', ...] >>> semcor.chunks() [['The'], ['Fulton', 'County', 'Grand', 'Jury'], ...] >>> semcor.sents() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...], ['The', 'jury', 'further', 'said', ...], ...] >>> semcor.chunk_sents() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [[['The'], ['Fulton', 'County', 'Grand', 'Jury'], ['said'], ... ['.']], [['The'], ['jury'], ['further'], ['said'], ... ['.']], ...] >>> list(map(str, semcor.tagged_chunks(tag='both')[:3])) ['(DT The)', "(Lemma('group.n.01.group') (NE (NNP Fulton County Grand Jury)))", "(Lemma('state.v.01.say') (VB said))"] >>> [[str(c) for c in s] for s in semcor.tagged_sents(tag='both')[:2]] [['(DT The)', "(Lemma('group.n.01.group') (NE (NNP Fulton County Grand Jury)))", ... '(None .)'], ['(DT The)', ... '(None .)']] The IEER corpus is another chunked corpus. This corpus is unusual in that each corpus item contains multiple documents. (This reflects the fact that each corpus file contains multiple documents.) 
The IEER corpus defines the `parsed_docs` method, which returns the documents in a given item as `IEERDocument` objects: >>> from nltk.corpus import ieer >>> ieer.fileids() # doctest: +NORMALIZE_WHITESPACE ['APW_19980314', 'APW_19980424', 'APW_19980429', 'NYT_19980315', 'NYT_19980403', 'NYT_19980407'] >>> docs = ieer.parsed_docs('APW_19980314') >>> print(docs[0]) >>> print(docs[0].docno) APW19980314.0391 >>> print(docs[0].doctype) NEWS STORY >>> print(docs[0].date_time) 03/14/1998 10:36:00 >>> print(docs[0].headline) (DOCUMENT Kenyans protest tax hikes) >>> print(docs[0].text) # doctest: +ELLIPSIS (DOCUMENT (LOCATION NAIROBI) , (LOCATION Kenya) ( (ORGANIZATION AP) ) _ (CARDINAL Thousands) of laborers, ... on (DATE Saturday) ...) Parsed Corpora ============== The Treebank corpora provide a syntactic parse for each sentence. The NLTK data package includes a 10% sample of the Penn Treebank (in ``treebank``), as well as the Sinica Treebank (in ``sinica_treebank``). Reading the Penn Treebank (Wall Street Journal sample): >>> from nltk.corpus import treebank >>> print(treebank.fileids()) # doctest: +ELLIPSIS ['wsj_0001.mrg', 'wsj_0002.mrg', 'wsj_0003.mrg', 'wsj_0004.mrg', ...] >>> print(treebank.words('wsj_0003.mrg')) ['A', 'form', 'of', 'asbestos', 'once', 'used', ...] >>> print(treebank.tagged_words('wsj_0003.mrg')) [('A', 'DT'), ('form', 'NN'), ('of', 'IN'), ...] >>> print(treebank.parsed_sents('wsj_0003.mrg')[0]) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE (S (S-TPC-1 (NP-SBJ (NP (NP (DT A) (NN form)) (PP (IN of) (NP (NN asbestos)))) (RRC ...)...)...) ... (VP (VBD reported) (SBAR (-NONE- 0) (S (-NONE- *T*-1)))) (. .)) If you have access to a full installation of the Penn Treebank, NLTK can be configured to load it as well. Download the ``ptb`` package, and in the directory ``nltk_data/corpora/ptb`` place the ``BROWN`` and ``WSJ`` directories of the Treebank installation (symlinks work as well). Then use the ``ptb`` module instead of ``treebank``: >>> from nltk.corpus import ptb >>> print(ptb.fileids()) # doctest: +SKIP ['BROWN/CF/CF01.MRG', 'BROWN/CF/CF02.MRG', 'BROWN/CF/CF03.MRG', 'BROWN/CF/CF04.MRG', ...] >>> print(ptb.words('WSJ/00/WSJ_0003.MRG')) # doctest: +SKIP ['A', 'form', 'of', 'asbestos', 'once', 'used', '*', ...] >>> print(ptb.tagged_words('WSJ/00/WSJ_0003.MRG')) # doctest: +SKIP [('A', 'DT'), ('form', 'NN'), ('of', 'IN'), ...] ...and so forth, like ``treebank`` but with extended fileids. Categories specified in ``allcats.txt`` can be used to filter by genre; they consist of ``news`` (for WSJ articles) and names of the Brown subcategories (``fiction``, ``humor``, ``romance``, etc.): >>> ptb.categories() # doctest: +SKIP ['adventure', 'belles_lettres', 'fiction', 'humor', 'lore', 'mystery', 'news', 'romance', 'science_fiction'] >>> print(ptb.fileids('news')) # doctest: +SKIP ['WSJ/00/WSJ_0001.MRG', 'WSJ/00/WSJ_0002.MRG', 'WSJ/00/WSJ_0003.MRG', ...] >>> print(ptb.words(categories=['humor','fiction'])) # doctest: +SKIP ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back', ...] As PropBank and NomBank depend on the (WSJ portion of the) Penn Treebank, the modules ``propbank_ptb`` and ``nombank_ptb`` are provided for access to a full PTB installation. Reading the Sinica Treebank: >>> from nltk.corpus import sinica_treebank >>> print(sinica_treebank.sents()) # doctest: +SKIP [['\xe4\xb8\x80'], ['\xe5\x8f\x8b\xe6\x83\x85'], ...] 
>>> sinica_treebank.parsed_sents()[25] # doctest: +SKIP Tree('S', [Tree('NP', [Tree('Nba', ['\xe5\x98\x89\xe7\x8f\x8d'])]), Tree('V\xe2\x80\xa7\xe5\x9c\xb0', [Tree('VA11', ['\xe4\xb8\x8d\xe5\x81\x9c']), Tree('DE', ['\xe7\x9a\x84'])]), Tree('VA4', ['\xe5\x93\xad\xe6\xb3\xa3'])]) Reading the CoNLL 2007 Dependency Treebanks: >>> from nltk.corpus import conll2007 >>> conll2007.sents('esp.train')[0] # doctest: +SKIP ['El', 'aumento', 'del', 'índice', 'de', 'desempleo', ...] >>> conll2007.parsed_sents('esp.train')[0] # doctest: +SKIP >>> print(conll2007.parsed_sents('esp.train')[0].tree()) # doctest: +SKIP (fortaleció (aumento El (del (índice (de (desempleo estadounidense))))) hoy considerablemente (al (euro (cotizaba , que (a (15.35 las GMT)) se (en (mercado el (de divisas) (de Fráncfort))) (a 0,9452_dólares) (frente_a , (0,9349_dólares los (de (mañana esta))))))) .) NLTK also provides a corpus reader for the York-Toronto-Helsinki Parsed Corpus of Old English Prose (YCOE); but the corpus itself is not included in the NLTK data package. If you install it yourself, you can use NLTK to access it: >>> from nltk.corpus import ycoe >>> for tree in ycoe.parsed_sents('cocuraC')[:4]: ... print(tree) # doctest: +SKIP (CP-THT (C +D+atte) (IP-SUB ...) ... (. .)) (IP-MAT (IP-MAT-0 (PP (P On) (NP (ADJ o+dre) (N wisan)))...) ... (. .)) (IP-MAT (NP-NOM-x-2 *exp*) (NP-DAT-1 (D^D +D+am) (ADJ^D unge+dyldegum)) ... (. .)) (IP-MAT (ADVP (ADV Sw+a)) (NP-NOM-x (PRO^N hit)) (ADVP-TMP (ADV^T oft)) ... (. .)) If the YCOE corpus is not available, you will get an error message when you try to access it: >>> from nltk.corpus import ycoe >>> print(ycoe) # doctest: +SKIP Traceback (most recent call last): LookupError: ********************************************************************** Resource 'corpora/ycoe' not found. For installation instructions, please see . Searched in: - ... ********************************************************************** Word Lists and Lexicons ======================= The NLTK data package also includes a number of lexicons and word lists. These are accessed just like text corpora. The following examples illustrate the use of the wordlist corpora: >>> from nltk.corpus import names, stopwords, words >>> words.fileids() ['en', 'en-basic'] >>> words.words('en') # doctest: +ELLIPSIS ['A', 'a', 'aa', 'aal', 'aalii', 'aam', 'Aani', 'aardvark', 'aardwolf', ...] >>> stopwords.fileids() # doctest: +ELLIPSIS ['danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', ...] >>> stopwords.words('portuguese') # doctest: +ELLIPSIS ['de', 'a', 'o', 'que', 'e', 'do', 'da', 'em', 'um', 'para', ...] >>> names.fileids() ['female.txt', 'male.txt'] >>> names.words('male.txt') # doctest: +ELLIPSIS ['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', ...] >>> names.words('female.txt') # doctest: +ELLIPSIS ['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', ...] The CMU Pronunciation Dictionary corpus contains pronounciation transcriptions for over 100,000 words. It can be accessed as a list of entries (where each entry consists of a word, an identifier, and a transcription) or as a dictionary from words to lists of transcriptions. Transcriptions are encoded as tuples of phoneme strings. 
>>> from nltk.corpus import cmudict >>> print(cmudict.entries()[653:659]) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [('acetate', ['AE1', 'S', 'AH0', 'T', 'EY2', 'T']), ('acetic', ['AH0', 'S', 'EH1', 'T', 'IH0', 'K']), ('acetic', ['AH0', 'S', 'IY1', 'T', 'IH0', 'K']), ('aceto', ['AA0', 'S', 'EH1', 'T', 'OW0']), ('acetochlor', ['AA0', 'S', 'EH1', 'T', 'OW0', 'K', 'L', 'AO2', 'R']), ('acetone', ['AE1', 'S', 'AH0', 'T', 'OW2', 'N'])] >>> # Load the entire cmudict corpus into a Python dictionary: >>> transcr = cmudict.dict() >>> print([transcr[w][0] for w in 'Natural Language Tool Kit'.lower().split()]) # doctest: +NORMALIZE_WHITESPACE [['N', 'AE1', 'CH', 'ER0', 'AH0', 'L'], ['L', 'AE1', 'NG', 'G', 'W', 'AH0', 'JH'], ['T', 'UW1', 'L'], ['K', 'IH1', 'T']] WordNet ======= Please see the separate WordNet howto. FrameNet ======== Please see the separate FrameNet howto. PropBank ======== Please see the separate PropBank howto. SentiWordNet ============ Please see the separate SentiWordNet howto. Categorized Corpora =================== Several corpora included with NLTK contain documents that have been categorized for topic, genre, polarity, etc. In addition to the standard corpus interface, these corpora provide access to the list of categories and the mapping between the documents and their categories (in both directions). Access the categories using the ``categories()`` method, e.g.: >>> from nltk.corpus import brown, movie_reviews, reuters >>> brown.categories() # doctest: +NORMALIZE_WHITESPACE ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] >>> movie_reviews.categories() ['neg', 'pos'] >>> reuters.categories() # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS ['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', ...] This method has an optional argument that specifies a document or a list of documents, allowing us to map from (one or more) documents to (one or more) categories: >>> brown.categories('ca01') ['news'] >>> brown.categories(['ca01','cb01']) ['editorial', 'news'] >>> reuters.categories('training/9865') ['barley', 'corn', 'grain', 'wheat'] >>> reuters.categories(['training/9865', 'training/9880']) ['barley', 'corn', 'grain', 'money-fx', 'wheat'] We can go back the other way using the optional argument of the ``fileids()`` method: >>> reuters.fileids('barley') # doctest: +ELLIPSIS ['test/15618', 'test/15649', 'test/15676', 'test/15728', 'test/15871', ...] Both the ``categories()`` and ``fileids()`` methods return a sorted list containing no duplicates. In addition to mapping between categories and documents, these corpora permit direct access to their contents via the categories. Instead of accessing a subset of a corpus by specifying one or more fileids, we can identify one or more categories, e.g.: >>> brown.tagged_words(categories='news') [('The', 'AT'), ('Fulton', 'NP-TL'), ...] >>> brown.sents(categories=['editorial','reviews']) # doctest: +NORMALIZE_WHITESPACE [['Assembly', 'session', 'brought', 'much', 'good'], ['The', 'General', 'Assembly', ',', 'which', 'adjourns', 'today', ',', 'has', 'performed', 'in', 'an', 'atmosphere', 'of', 'crisis', 'and', 'struggle', 'from', 'the', 'day', 'it', 'convened', '.'], ...] Note that it is an error to specify both documents and categories. 
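For example (a sketch only -- the exact exception message may vary between
NLTK versions), requesting both at once raises an error rather than silently
intersecting the two selections:

    >>> brown.words(fileids='ca01', categories='news') # doctest: +SKIP
    Traceback (most recent call last):
      ...
    ValueError: Specify fileids or categories, not both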
In the context of a text categorization system, we can easily test if the category assigned to a document is correct as follows: >>> def classify(doc): return 'news' # Trivial classifier >>> doc = 'ca01' >>> classify(doc) in brown.categories(doc) True Other Corpora ============= comparative_sentences --------------------- A list of sentences from various sources, especially reviews and articles. Each line contains one sentence; sentences were separated by using a sentence tokenizer. Comparative sentences have been annotated with their type, entities, features and keywords. >>> from nltk.corpus import comparative_sentences >>> comparison = comparative_sentences.comparisons()[0] >>> comparison.text ['its', 'fast-forward', 'and', 'rewind', 'work', 'much', 'more', 'smoothly', 'and', 'consistently', 'than', 'those', 'of', 'other', 'models', 'i', "'ve", 'had', '.'] >>> comparison.entity_2 'models' >>> (comparison.feature, comparison.keyword) ('rewind', 'more') >>> len(comparative_sentences.comparisons()) 853 opinion_lexicon --------------- A list of positive and negative opinion words or sentiment words for English. >>> from nltk.corpus import opinion_lexicon >>> opinion_lexicon.words()[:4] ['2-faced', '2-faces', 'abnormal', 'abolish'] The OpinionLexiconCorpusReader also provides shortcuts to retrieve positive/negative words: >>> opinion_lexicon.negative()[:4] ['2-faced', '2-faces', 'abnormal', 'abolish'] Note that words from `words()` method in opinion_lexicon are sorted by file id, not alphabetically: >>> opinion_lexicon.words()[0:10] ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort', 'aborted'] >>> sorted(opinion_lexicon.words())[0:10] ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably', 'abominate', 'abomination', 'abort'] ppattach -------- The Prepositional Phrase Attachment corpus is a corpus of prepositional phrase attachment decisions. Each instance in the corpus is encoded as a ``PPAttachment`` object: >>> from nltk.corpus import ppattach >>> ppattach.attachments('training') # doctest: +NORMALIZE_WHITESPACE [PPAttachment(sent='0', verb='join', noun1='board', prep='as', noun2='director', attachment='V'), PPAttachment(sent='1', verb='is', noun1='chairman', prep='of', noun2='N.V.', attachment='N'), ...] >>> inst = ppattach.attachments('training')[0] >>> (inst.sent, inst.verb, inst.noun1, inst.prep, inst.noun2) ('0', 'join', 'board', 'as', 'director') >>> inst.attachment 'V' product_reviews_1 and product_reviews_2 --------------------------------------- These two datasets respectively contain annotated customer reviews of 5 and 9 products from amazon.com. >>> from nltk.corpus import product_reviews_1 >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt') >>> review = camera_reviews[0] >>> review.sents()[0] ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am', 'extremely', 'satisfied', 'with', 'the', 'purchase', '.'] >>> review.features() [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'), ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'), ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'), ('option', '+1')] It is also possible to reach the same information directly from the stream: >>> product_reviews_1.features('Canon_G3.txt') [('canon powershot g3', '+3'), ('use', '+2'), ...] 
We can compute stats for specific product features: >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture']) >>> # We use float for backward compatibility with division in Python2.7 >>> mean = float(tot)/n_reviews >>> print(n_reviews, tot, mean) 15 24 1.6 pros_cons --------- A list of pros/cons sentences for determining context (aspect) dependent sentiment words, which are then applied to sentiment analysis of comparative sentences. >>> from nltk.corpus import pros_cons >>> pros_cons.sents(categories='Cons') [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy', 'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'], ...] >>> pros_cons.words('IntegratedPros.txt') ['Easy', 'to', 'use', ',', 'economical', '!', ...] semcor ------ The Brown Corpus, annotated with WordNet senses. >>> from nltk.corpus import semcor >>> semcor.words('brown2/tagfiles/br-n12.xml') # doctest: +ELLIPSIS ['When', 'several', 'minutes', 'had', 'passed', ...] >>> sent = semcor.xml('brown2/tagfiles/br-n12.xml').findall('context/p/s')[0] >>> for wordform in sent.getchildren(): ... print(wordform.text, end=' ') ... for key in sorted(wordform.keys()): ... print(key + '=' + wordform.get(key), end=' ') ... print() ... When cmd=ignore pos=WRB several cmd=done lemma=several lexsn=5:00:00:some(a):00 pos=JJ wnsn=1 minutes cmd=done lemma=minute lexsn=1:28:00:: pos=NN wnsn=1 had cmd=done ot=notag pos=VBD passed cmd=done lemma=pass lexsn=2:38:03:: pos=VB wnsn=4 and cmd=ignore pos=CC Curt cmd=done lemma=person lexsn=1:03:00:: pn=person pos=NNP rdf=person wnsn=1 had cmd=done ot=notag pos=VBD n't cmd=done lemma=n't lexsn=4:02:00:: pos=RB wnsn=0 emerged cmd=done lemma=emerge lexsn=2:30:00:: pos=VB wnsn=1 from cmd=ignore pos=IN the cmd=ignore pos=DT livery_stable cmd=done lemma=livery_stable lexsn=1:06:00:: pos=NN wnsn=1 , Brenner cmd=done lemma=person lexsn=1:03:00:: pn=person pos=NNP rdf=person wnsn=1 re-entered cmd=done lemma=re-enter lexsn=2:38:00:: pos=VB wnsn=1 the cmd=ignore pos=DT hotel cmd=done lemma=hotel lexsn=1:06:00:: pos=NN wnsn=1 and cmd=ignore pos=CC faced cmd=done lemma=face lexsn=2:42:02:: pos=VB wnsn=4 Summers cmd=done lemma=person lexsn=1:03:00:: pn=person pos=NNP rdf=person wnsn=1 across cmd=ignore pos=IN the cmd=ignore pos=DT counter cmd=done lemma=counter lexsn=1:06:00:: pos=NN wnsn=1 . senseval -------- The Senseval 2 corpus is a word sense disambiguation corpus. Each item in the corpus corresponds to a single ambiguous word. For each of these words, the corpus contains a list of instances, corresponding to occurrences of that word. Each instance provides the word; a list of word senses that apply to the word occurrence; and the word's context. >>> from nltk.corpus import senseval >>> senseval.fileids() ['hard.pos', 'interest.pos', 'line.pos', 'serve.pos'] >>> senseval.instances('hard.pos') ... # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [SensevalInstance(word='hard-a', position=20, context=[('``', '``'), ('he', 'PRP'), ...('hard', 'JJ'), ...], senses=('HARD1',)), SensevalInstance(word='hard-a', position=10, context=[('clever', 'NNP'), ...('hard', 'JJ'), ('time', 'NN'), ...], senses=('HARD1',)), ...] The following code looks at instances of the word 'interest', and displays their local context (2 words on each side) and word sense(s): >>> for inst in senseval.instances('interest.pos')[:10]: ... 
p = inst.position ... left = ' '.join(w for (w,t) in inst.context[p-2:p]) ... word = ' '.join(w for (w,t) in inst.context[p:p+1]) ... right = ' '.join(w for (w,t) in inst.context[p+1:p+3]) ... senses = ' '.join(inst.senses) ... print('%20s |%10s | %-15s -> %s' % (left, word, right, senses)) declines in | interest | rates . -> interest_6 indicate declining | interest | rates because -> interest_6 in short-term | interest | rates . -> interest_6 4 % | interest | in this -> interest_5 company with | interests | in the -> interest_5 , plus | interest | . -> interest_6 set the | interest | rate on -> interest_6 's own | interest | , prompted -> interest_4 principal and | interest | is the -> interest_6 increase its | interest | to 70 -> interest_5 sentence_polarity ----------------- The Sentence Polarity dataset contains 5331 positive and 5331 negative processed sentences. >>> from nltk.corpus import sentence_polarity >>> sentence_polarity.sents() [['simplistic', ',', 'silly', 'and', 'tedious', '.'], ["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', 'it', 'funny', '.'], ...] >>> sentence_polarity.categories() ['neg', 'pos'] >>> sentence_polarity.sents()[1] ["it's", 'so', 'laddish', 'and', 'juvenile', ',', 'only', 'teenage', 'boys', 'could', 'possibly', 'find', 'it', 'funny', '.'] shakespeare ----------- The Shakespeare corpus contains a set of Shakespeare plays, formatted as XML files. These corpora are returned as ElementTree objects: >>> from nltk.corpus import shakespeare >>> from xml.etree import ElementTree >>> shakespeare.fileids() # doctest: +ELLIPSIS ['a_and_c.xml', 'dream.xml', 'hamlet.xml', 'j_caesar.xml', ...] >>> play = shakespeare.xml('dream.xml') >>> print(play) # doctest: +ELLIPSIS >>> print('%s: %s' % (play[0].tag, play[0].text)) TITLE: A Midsummer Night's Dream >>> personae = [persona.text for persona in ... play.findall('PERSONAE/PERSONA')] >>> print(personae) # doctest: +ELLIPSIS ['THESEUS, Duke of Athens.', 'EGEUS, father to Hermia.', ...] >>> # Find and print speakers not listed as personae >>> names = [persona.split(',')[0] for persona in personae] >>> speakers = set(speaker.text for speaker in ... play.findall('*/*/*/SPEAKER')) >>> print(sorted(speakers.difference(names))) # doctest: +NORMALIZE_WHITESPACE ['ALL', 'COBWEB', 'DEMETRIUS', 'Fairy', 'HERNIA', 'LYSANDER', 'Lion', 'MOTH', 'MUSTARDSEED', 'Moonshine', 'PEASEBLOSSOM', 'Prologue', 'Pyramus', 'Thisbe', 'Wall'] subjectivity ----------- The Subjectivity Dataset contains 5000 subjective and 5000 objective processed sentences. >>> from nltk.corpus import subjectivity >>> subjectivity.categories() ['obj', 'subj'] >>> subjectivity.sents()[23] ['television', 'made', 'him', 'famous', ',', 'but', 'his', 'biggest', 'hits', 'happened', 'off', 'screen', '.'] >>> subjectivity.words(categories='subj') ['smart', 'and', 'alert', ',', 'thirteen', ...] toolbox ------- The Toolbox corpus distributed with NLTK contains a sample lexicon and several sample texts from the Rotokas language. The Toolbox corpus reader returns Toolbox files as XML ElementTree objects. The following example loads the Rotokas dictionary, and figures out the distribution of part-of-speech tags for reduplicated words. .. 
doctest: +SKIP >>> from nltk.corpus import toolbox >>> from nltk.probability import FreqDist >>> from xml.etree import ElementTree >>> import re >>> rotokas = toolbox.xml('rotokas.dic') >>> redup_pos_freqdist = FreqDist() >>> # Note: we skip over the first record, which is actually >>> # the header. >>> for record in rotokas[1:]: ... lexeme = record.find('lx').text ... if re.match(r'(.*)\1$', lexeme): ... redup_pos_freqdist[record.find('ps').text] += 1 >>> for item, count in redup_pos_freqdist.most_common(): ... print(item, count) V 41 N 14 ??? 4 This example displays some records from a Rotokas text: .. doctest: +SKIP >>> river = toolbox.xml('rotokas/river.txt', key='ref') >>> for record in river.findall('record')[:3]: ... for piece in record: ... if len(piece.text) > 60: ... print('%-6s %s...' % (piece.tag, piece.text[:57])) ... else: ... print('%-6s %s' % (piece.tag, piece.text)) ref Paragraph 1 t ``Viapau oisio ra ovaupasi ... m viapau oisio ra ovau -pa -si ... g NEG this way/like this and forget -PROG -2/3.DL... p NEG ??? CONJ V.I -SUFF.V.3 -SUFF.V... f ``No ken lus tingting wanema samting papa i bin tok,'' Na... fe ``Don't forget what Dad said,'' yelled Naomi. ref 2 t Osa Ira ora Reviti viapau uvupasiva. m osa Ira ora Reviti viapau uvu -pa -si ... g as/like name and name NEG hear/smell -PROG -2/3... p CONJ N.PN CONJ N.PN NEG V.T -SUFF.V.3 -SUF... f Tasol Ila na David no bin harim toktok. fe But Ila and David took no notice. ref 3 t Ikaupaoro rokosiva ... m ikau -pa -oro roko -si -va ... g run/hurry -PROG -SIM go down -2/3.DL.M -RP ... p V.T -SUFF.V.3 -SUFF.V.4 ADV -SUFF.V.4 -SUFF.VT.... f Tupela i bin hariap i go long wara . fe They raced to the river. timit ----- The NLTK data package includes a fragment of the TIMIT Acoustic-Phonetic Continuous Speech Corpus. This corpus is broken down into small speech samples, each of which is available as a wave file, a phonetic transcription, and a tokenized word list. >>> from nltk.corpus import timit >>> print(timit.utteranceids()) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', 'dr1-fvmh0/si2096', 'dr1-fvmh0/si836', 'dr1-fvmh0/sx116', 'dr1-fvmh0/sx206', 'dr1-fvmh0/sx26', 'dr1-fvmh0/sx296', ...] >>> item = timit.utteranceids()[5] >>> print(timit.phones(item)) # doctest: +NORMALIZE_WHITESPACE ['h#', 'k', 'l', 'ae', 's', 'pcl', 'p', 'dh', 'ax', 's', 'kcl', 'k', 'r', 'ux', 'ix', 'nx', 'y', 'ax', 'l', 'eh', 'f', 'tcl', 't', 'hh', 'ae', 'n', 'dcl', 'd', 'h#'] >>> print(timit.words(item)) ['clasp', 'the', 'screw', 'in', 'your', 'left', 'hand'] >>> timit.play(item) # doctest: +SKIP The corpus reader can combine the word segmentation information with the phonemes to produce a single tree structure: >>> for tree in timit.phone_trees(item): ... print(tree) (S h# (clasp k l ae s pcl p) (the dh ax) (screw s kcl k r ux) (in ix nx) (your y ax) (left l eh f tcl t) (hand hh ae n dcl d) h#) The start time and stop time of each phoneme, word, and sentence are also available: >>> print(timit.phone_times(item)) # doctest: +ELLIPSIS [('h#', 0, 2190), ('k', 2190, 3430), ('l', 3430, 4326), ...] >>> print(timit.word_times(item)) # doctest: +ELLIPSIS [('clasp', 2190, 8804), ('the', 8804, 9734), ...] 
>>> print(timit.sent_times(item)) [('Clasp the screw in your left hand.', 0, 32154)] We can use these times to play selected pieces of a speech sample: >>> timit.play(item, 2190, 8804) # 'clasp' # doctest: +SKIP The corpus reader can also be queried for information about the speaker and sentence identifier for a given speech sample: >>> print(timit.spkrid(item)) dr1-fvmh0 >>> print(timit.sentid(item)) sx116 >>> print(timit.spkrinfo(timit.spkrid(item))) # doctest: +NORMALIZE_WHITESPACE SpeakerInfo(id='VMH0', sex='F', dr='1', use='TRN', recdate='03/11/86', birthdate='01/08/60', ht='5\'05"', race='WHT', edu='BS', comments='BEST NEW ENGLAND ACCENT SO FAR') >>> # List the speech samples from the same speaker: >>> timit.utteranceids(spkrid=timit.spkrid(item)) # doctest: +ELLIPSIS ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', ...] twitter_samples --------------- Twitter is well-known microblog service that allows public data to be collected via APIs. NLTK's twitter corpus currently contains a sample of 20k Tweets retrieved from the Twitter Streaming API. >>> from nltk.corpus import twitter_samples >>> twitter_samples.fileids() ['negative_tweets.json', 'positive_tweets.json', 'tweets.20150430-223406.json'] We follow standard practice in storing full Tweets as line-separated JSON. These data structures can be accessed via `tweets.docs()`. However, in general it is more practical to focus just on the text field of the Tweets, which are accessed via the `strings()` method. >>> twitter_samples.strings('tweets.20150430-223406.json') ['RT @KirkKus: Indirect cost of the UK being in the EU is estimated to be costing Britain \xa3170 billion per year! #BetterOffOut #UKIP', ...] The default tokenizer for Tweets is specialised for 'casual' text, and the `tokenized()` method returns a list of lists of tokens. >>> twitter_samples.tokenized('tweets.20150430-223406.json') [['RT', '@KirkKus', ':', 'Indirect', 'cost', 'of', 'the', 'UK', 'being', 'in', ...], ['VIDEO', ':', 'Sturgeon', 'on', 'post-election', 'deals', 'http://t.co/BTJwrpbmOY'], ...] rte --- The RTE (Recognizing Textual Entailment) corpus was derived from the RTE1, RTE2 and RTE3 datasets (dev and test data), and consists of a list of XML-formatted 'text'/'hypothesis' pairs. >>> from nltk.corpus import rte >>> print(rte.fileids()) # doctest: +ELLIPSIS ['rte1_dev.xml', 'rte1_test.xml', 'rte2_dev.xml', ..., 'rte3_test.xml'] >>> rtepairs = rte.pairs(['rte2_test.xml', 'rte3_test.xml']) >>> print(rtepairs) # doctest: +ELLIPSIS [, , , ...] In the gold standard test sets, each pair is labeled according to whether or not the text 'entails' the hypothesis; the entailment value is mapped to an integer 1 (True) or 0 (False). >>> rtepairs[5] >>> rtepairs[5].text # doctest: +NORMALIZE_WHITESPACE 'His wife Strida won a seat in parliament after forging an alliance with the main anti-Syrian coalition in the recent election.' >>> rtepairs[5].hyp 'Strida elected to parliament.' >>> rtepairs[5].value 1 The RTE corpus also supports an ``xml()`` method which produces ElementTrees. >>> xmltree = rte.xml('rte3_dev.xml') >>> xmltree # doctest: +SKIP >>> xmltree[7].findtext('t') # doctest: +NORMALIZE_WHITESPACE "Mrs. Bush's approval ratings have remained very high, above 80%, even as her husband's have recently dropped below 50%." verbnet ------- The VerbNet corpus is a lexicon that divides verbs into classes, based on their syntax-semantics linking behavior. 
The basic elements in the lexicon are verb lemmas, such as 'abandon' and 'accept', and verb classes, which have identifiers such as 'remove-10.1' and 'admire-31.2-1'. These class identifiers consist of a representative verb selected from the class, followed by a numerical identifier. The list of verb lemmas, and the list of class identifiers, can be retrieved with the following methods: >>> from nltk.corpus import verbnet >>> verbnet.lemmas()[20:25] ['accelerate', 'accept', 'acclaim', 'accompany', 'accrue'] >>> verbnet.classids()[:5] ['accompany-51.7', 'admire-31.2', 'admire-31.2-1', 'admit-65', 'adopt-93'] The `classids()` method may also be used to retrieve the classes that a given lemma belongs to: >>> verbnet.classids('accept') ['approve-77', 'characterize-29.2-1-1', 'obtain-13.5.2'] The primary object in the lexicon is a class record, which is stored as an ElementTree xml object. The class record for a given class identifier is returned by the `vnclass()` method: >>> verbnet.vnclass('remove-10.1') # doctest: +ELLIPSIS The `vnclass()` method also accepts "short" identifiers, such as '10.1': >>> verbnet.vnclass('10.1') # doctest: +ELLIPSIS See the Verbnet documentation, or the Verbnet files, for information about the structure of this xml. As an example, we can retrieve a list of thematic roles for a given Verbnet class: >>> vn_31_2 = verbnet.vnclass('admire-31.2') >>> for themrole in vn_31_2.findall('THEMROLES/THEMROLE'): ... print(themrole.attrib['type'], end=' ') ... for selrestr in themrole.findall('SELRESTRS/SELRESTR'): ... print('[%(Value)s%(type)s]' % selrestr.attrib, end=' ') ... print() Theme Experiencer [+animate] Predicate The Verbnet corpus also provides a variety of pretty printing functions that can be used to display the xml contents in a more concise form. The simplest such method is `pprint()`: >>> print(verbnet.pprint('57')) weather-57 Subclasses: (none) Members: blow clear drizzle fog freeze gust hail howl lightning mist mizzle pelt pour precipitate rain roar shower sleet snow spit spot sprinkle storm swelter teem thaw thunder Thematic roles: * Theme[+concrete +force] Frames: Intransitive (Expletive Subject) Syntax: LEX[it] LEX[[+be]] VERB Semantics: * weather(during(E), Weather_type, ?Theme) NP (Expletive Subject, Theme Object) Syntax: LEX[it] LEX[[+be]] VERB NP[Theme] Semantics: * weather(during(E), Weather_type, Theme) PP (Expletive Subject, Theme-PP) Syntax: LEX[it[+be]] VERB PREP[with] NP[Theme] Semantics: * weather(during(E), Weather_type, Theme) nps_chat -------- The NPS Chat Corpus, Release 1.0 consists of over 10,000 posts in age-specific chat rooms, which have been anonymized, POS-tagged and dialogue-act tagged. >>> print(nltk.corpus.nps_chat.words()) ['now', 'im', 'left', 'with', 'this', 'gay', ...] >>> print(nltk.corpus.nps_chat.tagged_words()) [('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ...] >>> print(nltk.corpus.nps_chat.tagged_posts()) # doctest: +NORMALIZE_WHITESPACE [[('now', 'RB'), ('im', 'PRP'), ('left', 'VBD'), ('with', 'IN'), ('this', 'DT'), ('gay', 'JJ'), ('name', 'NN')], [(':P', 'UH')], ...] We can access the XML elements corresponding to individual posts. These elements have ``class`` and ``user`` attributes that we can access using ``p.attrib['class']`` and ``p.attrib['user']``. They also have text content, accessed using ``p.text``. >>> print(nltk.corpus.nps_chat.xml_posts()) # doctest: +ELLIPSIS [, , ...] 
>>> posts = nltk.corpus.nps_chat.xml_posts() >>> sorted(nltk.FreqDist(p.attrib['class'] for p in posts).keys()) ['Accept', 'Bye', 'Clarify', 'Continuer', 'Emotion', 'Emphasis', 'Greet', 'Other', 'Reject', 'Statement', 'System', 'nAnswer', 'whQuestion', 'yAnswer', 'ynQuestion'] >>> posts[0].text 'now im left with this gay name' In addition to the above methods for accessing tagged text, we can navigate the XML structure directly, as follows: >>> tokens = posts[0].findall('terminals/t') >>> [t.attrib['pos'] + "/" + t.attrib['word'] for t in tokens] ['RB/now', 'PRP/im', 'VBD/left', 'IN/with', 'DT/this', 'JJ/gay', 'NN/name'] multext_east ------------ The Multext-East Corpus consists of POS-tagged versions of George Orwell's book 1984 in 12 languages: English, Czech, Hungarian, Macedonian, Slovenian, Serbian, Slovak, Romanian, Estonian, Farsi, Bulgarian and Polish. The corpus can be accessed using the usual methods for tagged corpora. The tagset can be transformed from the Multext-East specific MSD tags to the Universal tagset using the "tagset" parameter of all functions returning tagged parts of the corpus. >>> print(nltk.corpus.multext_east.words("oana-en.xml")) ['It', 'was', 'a', 'bright', ...] >>> print(nltk.corpus.multext_east.tagged_words("oana-en.xml")) [('It', '#Pp3ns'), ('was', '#Vmis3s'), ('a', '#Di'), ...] >>> print(nltk.corpus.multext_east.tagged_sents("oana-en.xml", "universal")) [[('It', 'PRON'), ('was', 'VERB'), ('a', 'DET'), ...] --------------------- Corpus Reader Classes --------------------- NLTK's *corpus reader* classes are used to access the contents of a diverse set of corpora. Each corpus reader class is specialized to handle a specific corpus format. Examples include the `PlaintextCorpusReader`, which handles corpora that consist of a set of unannotated text files, and the `BracketParseCorpusReader`, which handles corpora that consist of files containing parenthesis-delineated parse trees. Automatically Created Corpus Reader Instances ============================================= When the `nltk.corpus` module is imported, it automatically creates a set of corpus reader instances that can be used to access the corpora in the NLTK data distribution. Here is a small sample of those corpus reader instances: >>> import nltk >>> nltk.corpus.brown # doctest: +ELLIPSIS >>> nltk.corpus.treebank # doctest: +ELLIPSIS >>> nltk.corpus.names # doctest: +ELLIPSIS >>> nltk.corpus.genesis # doctest: +ELLIPSIS >>> nltk.corpus.inaugural # doctest: +ELLIPSIS This sample illustrates that different corpus reader classes are used to read different corpora; but that the same corpus reader class may be used for more than one corpus (e.g., ``genesis`` and ``inaugural``). Creating New Corpus Reader Instances ==================================== Although the `nltk.corpus` module automatically creates corpus reader instances for the corpora in the NLTK data distribution, you may sometimes need to create your own corpus reader. In particular, you would need to create your own corpus reader if you want... - To access a corpus that is not included in the NLTK data distribution. - To access a full copy of a corpus for which the NLTK data distribution only provides a sample. - To access a corpus using a customized corpus reader (e.g., with a customized tokenizer). To create a new corpus reader, you will first need to look up the signature for that corpus reader's constructor. 
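One convenient way to look up that signature interactively is Python's built-in ``help()`` function; the following is only a sketch (the exact output depends on the installed NLTK version, so the call is skipped here):

    >>> help(nltk.corpus.PlaintextCorpusReader.__init__) # doctest: +SKIP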
Different corpus readers have different constructor signatures, but most of the constructor signatures have the basic form:: SomeCorpusReader(root, files, ...options...) Where ``root`` is an absolute path to the directory containing the corpus data files; ``files`` is either a list of file names (relative to ``root``) or a regexp specifying which files should be included; and ``options`` are additional reader-specific options. For example, we can create a customized corpus reader for the genesis corpus that uses a different sentence tokenizer as follows: >>> # Find the directory where the corpus lives. >>> genesis_dir = nltk.data.find('corpora/genesis') >>> # Create our custom sentence tokenizer. >>> my_sent_tokenizer = nltk.RegexpTokenizer('[^.!?]+') >>> # Create the new corpus reader object. >>> my_genesis = nltk.corpus.PlaintextCorpusReader( ... genesis_dir, '.*\.txt', sent_tokenizer=my_sent_tokenizer) >>> # Use the new corpus reader object. >>> print(my_genesis.sents('english-kjv.txt')[0]) # doctest: +NORMALIZE_WHITESPACE ['In', 'the', 'beginning', 'God', 'created', 'the', 'heaven', 'and', 'the', 'earth'] If you wish to read your own plaintext corpus, which is stored in the directory '/usr/share/some-corpus', then you can create a corpus reader for it with:: >>> my_corpus = nltk.corpus.PlaintextCorpusReader( ... '/usr/share/some-corpus', '.*\.txt') # doctest: +SKIP For a complete list of corpus reader subclasses, see the API documentation for `nltk.corpus.reader`. Corpus Types ============ Corpora vary widely in the types of content they include. This is reflected in the fact that the base class `CorpusReader` only defines a few general-purpose methods for listing and accessing the files that make up a corpus. It is up to the subclasses to define *data access methods* that provide access to the information in the corpus. However, corpus reader subclasses should be consistent in their definitions of these data access methods wherever possible. At a high level, corpora can be divided into three basic types: - A *token corpus* contains information about specific occurences of language use (or linguistic tokens), such as dialogues or written texts. Examples of token corpora are collections of written text and collections of speech. - A *type corpus*, or *lexicon*, contains information about a coherent set of lexical items (or linguistic types). Examples of lexicons are dictionaries and word lists. - A *language description corpus* contains information about a set of non-lexical linguistic constructs, such as grammar rules. However, many individual corpora blur the distinctions between these types. For example, corpora that are primarily lexicons may include token data in the form of example sentences; and corpora that are primarily token corpora may be accompanied by one or more word lists or other lexical data sets. Because corpora vary so widely in their information content, we have decided that it would not be wise to use separate corpus reader base classes for different corpus types. Instead, we simply try to make the corpus readers consistent wherever possible, but let them differ where the underlying data itself differs. Common Corpus Reader Methods ============================ As mentioned above, there are only a handful of methods that all corpus readers are guaranteed to implement. These methods provide access to the files that contain the corpus data. 
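In addition to the methods demonstrated below, readers also inherit an ``open()`` method from the `CorpusReader` base class, which returns a read-only stream over a given corpus file. This is only a sketch (the calls are skipped because the result depends on the locally installed data):

    >>> stream = nltk.corpus.brown.open('ca01') # doctest: +SKIP
    >>> stream.readline() # doctest: +SKIP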
Every corpus is assumed to consist of one or more files, all located in a common root directory (or in subdirectories of that root directory). The absolute path to the root directory is stored in the ``root`` property: >>> import os >>> str(nltk.corpus.genesis.root).replace(os.path.sep,'/') # doctest: +ELLIPSIS '.../nltk_data/corpora/genesis' Each file within the corpus is identified by a platform-independent identifier, which is basically a path string that uses ``/`` as the path separator. I.e., this identifier can be converted to a relative path as follows: >>> some_corpus_file_id = nltk.corpus.reuters.fileids()[0] >>> import os.path >>> os.path.normpath(some_corpus_file_id).replace(os.path.sep,'/') 'test/14826' To get a list of all data files that make up a corpus, use the ``fileids()`` method. In some corpora, these files will not all contain the same type of data; for example, for the ``nltk.corpus.timit`` corpus, ``fileids()`` will return a list including text files, word segmentation files, phonetic transcription files, sound files, and metadata files. For corpora with diverse file types, the ``fileids()`` method will often take one or more optional arguments, which can be used to get a list of the files with a specific file type: >>> nltk.corpus.timit.fileids() # doctest: +ELLIPSIS ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa1.txt', 'dr1-fvmh0/sa1.wav', ...] >>> nltk.corpus.timit.fileids('phn') # doctest: +ELLIPSIS ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa2.phn', 'dr1-fvmh0/si1466.phn', ...] In some corpora, the files are divided into distinct categories. For these corpora, the ``fileids()`` method takes an optional argument, which can be used to get a list of the files within a specific category: >>> nltk.corpus.brown.fileids('hobbies') # doctest: +ELLIPSIS ['ce01', 'ce02', 'ce03', 'ce04', 'ce05', 'ce06', 'ce07', ...] The ``abspath()`` method can be used to find the absolute path to a corpus file, given its file identifier: >>> str(nltk.corpus.brown.abspath('ce06')).replace(os.path.sep,'/') # doctest: +ELLIPSIS '.../corpora/brown/ce06' The ``abspaths()`` method can be used to find the absolute paths for one corpus file, a list of corpus files, or (if no fileids are specified), all corpus files. This method is mainly useful as a helper method when defining corpus data access methods, since data access methods can usually be called with a string argument (to get a view for a specific file), with a list argument (to get a view for a specific list of files), or with no argument (to get a view for the whole corpus). Data Access Methods =================== Individual corpus reader subclasses typically extend this basic set of file-access methods with one or more *data access methods*, which provide easy access to the data contained in the corpus. The signatures for data access methods often have the basic form:: corpus_reader.some_data access(fileids=None, ...options...) Where ``fileids`` can be a single file identifier string (to get a view for a specific file); a list of file identifier strings (to get a view for a specific list of files); or None (to get a view for the entire corpus). 
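For example, the Brown corpus reader's ``words()`` method (one of the data access methods listed below) accepts all three forms of the ``fileids`` argument. This is just a sketch, so the calls are skipped and their outputs omitted:

    >>> # A single file identifier:
    >>> nltk.corpus.brown.words('ca01') # doctest: +SKIP
    >>> # A list of file identifiers:
    >>> nltk.corpus.brown.words(['ca01', 'ca02']) # doctest: +SKIP
    >>> # No argument: a view over the entire corpus:
    >>> nltk.corpus.brown.words() # doctest: +SKIP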
Some of the common data access methods, and their return types, are: - I{corpus}.words(): list of str - I{corpus}.sents(): list of (list of str) - I{corpus}.paras(): list of (list of (list of str)) - I{corpus}.tagged_words(): list of (str,str) tuple - I{corpus}.tagged_sents(): list of (list of (str,str)) - I{corpus}.tagged_paras(): list of (list of (list of (str,str))) - I{corpus}.chunked_sents(): list of (Tree w/ (str,str) leaves) - I{corpus}.parsed_sents(): list of (Tree with str leaves) - I{corpus}.parsed_paras(): list of (list of (Tree with str leaves)) - I{corpus}.xml(): A single xml ElementTree - I{corpus}.raw(): str (unprocessed corpus contents) For example, the `words()` method is supported by many different corpora, and returns a flat list of word strings: >>> nltk.corpus.brown.words() ['The', 'Fulton', 'County', 'Grand', 'Jury', ...] >>> nltk.corpus.treebank.words() ['Pierre', 'Vinken', ',', '61', 'years', 'old', ...] >>> nltk.corpus.conll2002.words() [u'Sao', u'Paulo', u'(', u'Brasil', u')', u',', u'23', ...] >>> nltk.corpus.genesis.words() [u'In', u'the', u'beginning', u'God', u'created', ...] On the other hand, the `tagged_words()` method is only supported by corpora that include part-of-speech annotations: >>> nltk.corpus.brown.tagged_words() [('The', 'AT'), ('Fulton', 'NP-TL'), ...] >>> nltk.corpus.treebank.tagged_words() [('Pierre', 'NNP'), ('Vinken', 'NNP'), ...] >>> nltk.corpus.conll2002.tagged_words() [(u'Sao', u'NC'), (u'Paulo', u'VMI'), (u'(', u'Fpa'), ...] >>> nltk.corpus.genesis.tagged_words() Traceback (most recent call last): ... AttributeError: 'PlaintextCorpusReader' object has no attribute 'tagged_words' Although most corpus readers use file identifiers to index their content, some corpora use different identifiers instead. For example, the data access methods for the ``timit`` corpus uses *utterance identifiers* to select which corpus items should be returned: >>> nltk.corpus.timit.utteranceids() # doctest: +ELLIPSIS ['dr1-fvmh0/sa1', 'dr1-fvmh0/sa2', 'dr1-fvmh0/si1466', ...] >>> nltk.corpus.timit.words('dr1-fvmh0/sa2') ["don't", 'ask', 'me', 'to', 'carry', 'an', 'oily', 'rag', 'like', 'that'] Attempting to call ``timit``\ 's data access methods with a file identifier will result in an exception: >>> nltk.corpus.timit.fileids() # doctest: +ELLIPSIS ['dr1-fvmh0/sa1.phn', 'dr1-fvmh0/sa1.txt', 'dr1-fvmh0/sa1.wav', ...] >>> nltk.corpus.timit.words('dr1-fvmh0/sa1.txt') # doctest: +SKIP Traceback (most recent call last): ... IOError: No such file or directory: '.../dr1-fvmh0/sa1.txt.wrd' As another example, the ``propbank`` corpus defines the ``roleset()`` method, which expects a roleset identifier, not a file identifier: >>> roleset = nltk.corpus.propbank.roleset('eat.01') >>> from xml.etree import ElementTree as ET >>> print(ET.tostring(roleset).decode('utf8')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE ...... ... ... Stream Backed Corpus Views ========================== An important feature of NLTK's corpus readers is that many of them access the underlying data files using "corpus views." A *corpus view* is an object that acts like a simple data structure (such as a list), but does not store the data elements in memory; instead, data elements are read from the underlying data files on an as-needed basis. By only loading items from the file on an as-needed basis, corpus views maintain both memory efficiency and responsiveness. 
The memory efficiency of corpus readers is important because some corpora contain very large amounts of data, and storing the entire data set in memory could overwhelm many machines. The responsiveness is important when experimenting with corpora in interactive sessions and in in-class demonstrations. The most common corpus view is the `StreamBackedCorpusView`, which acts as a read-only list of tokens. Two additional corpus view classes, `ConcatenatedCorpusView` and `LazySubsequence`, make it possible to create concatenations and take slices of `StreamBackedCorpusView` objects without actually storing the resulting list-like object's elements in memory. In the future, we may add additional corpus views that act like other basic data structures, such as dictionaries. Writing New Corpus Readers ========================== In order to add support for new corpus formats, it is necessary to define new corpus reader classes. For many corpus formats, writing new corpus readers is relatively straight-forward. In this section, we'll describe what's involved in creating a new corpus reader. If you do create a new corpus reader, we encourage you to contribute it back to the NLTK project. Don't Reinvent the Wheel ------------------------ Before you start writing a new corpus reader, you should check to be sure that the desired format can't be read using an existing corpus reader with appropriate constructor arguments. For example, although the `TaggedCorpusReader` assumes that words and tags are separated by ``/`` characters by default, an alternative tag-separation character can be specified via the ``sep`` constructor argument. You should also check whether the new corpus format can be handled by subclassing an existing corpus reader, and tweaking a few methods or variables. Design ------ If you decide to write a new corpus reader from scratch, then you should first decide which data access methods you want the reader to provide, and what their signatures should be. You should look at existing corpus readers that process corpora with similar data contents, and try to be consistent with those corpus readers whenever possible. You should also consider what sets of identifiers are appropriate for the corpus format. Where it's practical, file identifiers should be used. However, for some corpora, it may make sense to use additional sets of identifiers. Each set of identifiers should have a distinct name (e.g., fileids, utteranceids, rolesets); and you should be consistent in using that name to refer to that identifier. Do not use parameter names like ``id``, which leave it unclear what type of identifier is required. Once you've decided what data access methods and identifiers are appropriate for your corpus, you should decide if there are any customizable parameters that you'd like the corpus reader to handle. These parameters make it possible to use a single corpus reader to handle a wider variety of corpora. The ``sep`` argument for `TaggedCorpusReader`, mentioned above, is an example of a customizable corpus reader parameter. Implementation -------------- Constructor ~~~~~~~~~~~ If your corpus reader implements any customizable parameters, then you'll need to override the constructor. Typically, the new constructor will first call its base class's constructor, and then store the customizable parameters. For example, the `ConllChunkCorpusReader`\ 's constructor is defined as follows: >>> def __init__(self, root, files, chunk_types): ... CorpusReader.__init__(self, root, files) ... 
self.chunk_types = tuple(chunk_types) If your corpus reader does not implement any customization parameters, then you can often just inherit the base class's constructor. Data Access Methods ~~~~~~~~~~~~~~~~~~~ The most common type of data access method takes an argument identifying which files to access, and returns a view covering those files. This argument may be a single file identifier string (to get a view for a specific file); a list of file identifier strings (to get a view for a specific list of files); or None (to get a view for the entire corpus). The method's implementation converts this argument to a list of path names using the `abspaths()` method, which handles all three value types (string, list, and None): >>> print(str(nltk.corpus.brown.abspaths()).replace('\\\\','/')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [FileSystemPathPointer('.../corpora/brown/ca01'), FileSystemPathPointer('.../corpora/brown/ca02'), ...] >>> print(str(nltk.corpus.brown.abspaths('ce06')).replace('\\\\','/')) # doctest: +ELLIPSIS [FileSystemPathPointer('.../corpora/brown/ce06')] >>> print(str(nltk.corpus.brown.abspaths(['ce06', 'ce07'])).replace('\\\\','/')) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [FileSystemPathPointer('.../corpora/brown/ce06'), FileSystemPathPointer('.../corpora/brown/ce07')] An example of this type of method is the `words()` method, defined by the `PlaintextCorpusReader` as follows: >>> def words(self, fileids=None): ... return concat([self.CorpusView(fileid, self._read_word_block) ... for fileid in self.abspaths(fileids)]) This method first uses `abspaths()` to convert ``fileids`` to a list of absolute paths. It then creates a corpus view for each file, using the `PlaintextCorpusReader._read_word_block()` method to read elements from the data file (see the discussion of corpus views below). Finally, it combines these corpus views using the `nltk.corpus.reader.util.concat()` function. When writing a corpus reader for a corpus that is never expected to be very large, it can sometimes be appropriate to read the files directly, rather than using a corpus view. For example, the `WordListCorpusReader` class defines its `words()` method as follows: >>> def words(self, fileids=None): ... return concat([[w for w in open(fileid).read().split('\n') if w] ... for fileid in self.abspaths(fileids)]) (This is usually more appropriate for lexicons than for token corpora.) If the type of data returned by a data access method is one for which NLTK has a conventional representation (e.g., words, tagged words, and parse trees), then you should use that representation. Otherwise, you may find it necessary to define your own representation. For data structures that are relatively corpus-specific, it's usually best to define new classes for these elements. For example, the ``propbank`` corpus defines the `PropbankInstance` class to store the semantic role labeling instances described by the corpus; and the ``ppattach`` corpus defines the `PPAttachment` class to store the prepositional attachment instances described by the corpus. Corpus Views ~~~~~~~~~~~~ .. (Much of the content for this section is taken from the StreamBackedCorpusView docstring.) The heart of a `StreamBackedCorpusView` is its *block reader* function, which reads zero or more tokens from a stream, and returns them as a list. A very simple example of a block reader is: >>> def simple_block_reader(stream): ...
return stream.readline().split() This simple block reader reads a single line at a time, and returns a single token (consisting of a string) for each whitespace-separated substring on the line. A `StreamBackedCorpusView` built from this block reader will act like a read-only list of all the whitespace-separated tokens in an underlying file. When deciding how to define the block reader for a given corpus, careful consideration should be given to the size of blocks handled by the block reader. Smaller block sizes will increase the memory requirements of the corpus view's internal data structures (by 2 integers per block). On the other hand, larger block sizes may decrease performance for random access to the corpus. (But note that larger block sizes will *not* decrease performance for iteration.) Internally, the `StreamBackedCorpusView` class maintains a partial mapping from token index to file position, with one entry per block. When a token with a given index *i* is requested, the corpus view constructs it as follows: 1. First, it searches the toknum/filepos mapping for the token index closest to (but less than or equal to) *i*. 2. Then, starting at the file position corresponding to that index, it reads one block at a time using the block reader until it reaches the requested token. The toknum/filepos mapping is created lazily: it is initially empty, but every time a new block is read, the block's initial token is added to the mapping. (Thus, the toknum/filepos map has one entry per block.) You can create your own corpus view in one of two ways: 1. Call the `StreamBackedCorpusView` constructor, and provide your block reader function via the ``block_reader`` argument. 2. Subclass `StreamBackedCorpusView`, and override the `read_block()` method. The first option is usually easier, but the second option can allow you to write a single `read_block` method whose behavior can be customized by different parameters to the subclass's constructor. For an example of this design pattern, see the `TaggedCorpusView` class, which is used by `TaggedCorpusReader`. ---------------- Regression Tests ---------------- The following helper functions are used to create and then delete testing corpora that are stored in temporary directories. These testing corpora are used to make sure the readers work correctly. >>> import tempfile, os.path, textwrap >>> def make_testcorpus(ext='', **fileids): ... root = tempfile.mkdtemp() ... for fileid, contents in fileids.items(): ... fileid += ext ... f = open(os.path.join(root, fileid), 'w') ... f.write(textwrap.dedent(contents)) ... f.close() ... return root >>> def del_testcorpus(root): ... for fileid in os.listdir(root): ... os.remove(os.path.join(root, fileid)) ... os.rmdir(root) Plaintext Corpus Reader ======================= The plaintext corpus reader is used to access corpora that consist of unprocessed plaintext data. It assumes that paragraph breaks are indicated by blank lines. Sentences and words can be tokenized using the default tokenizers, or by custom tokenizers specified as parameters to the constructor. >>> root = make_testcorpus(ext='.txt', ... a="""\ ... This is the first sentence. Here is another ... sentence! And here's a third sentence. ... ... This is the second paragraph. Tokenization is currently ... fairly simple, so the period in Mr. gets tokenized. ... """, ... b="""This is the second file.""") >>> from nltk.corpus.reader.plaintext import PlaintextCorpusReader The list of documents can be specified explicitly, or implicitly (using a regexp).
The ``ext`` argument specifies a file extension. >>> corpus = PlaintextCorpusReader(root, ['a.txt', 'b.txt']) >>> corpus.fileids() ['a.txt', 'b.txt'] >>> corpus = PlaintextCorpusReader(root, '.*\.txt') >>> corpus.fileids() ['a.txt', 'b.txt'] The directory containing the corpus is corpus.root: >>> str(corpus.root) == str(root) True We can get a list of words, or the raw string: >>> corpus.words() ['This', 'is', 'the', 'first', 'sentence', '.', ...] >>> corpus.raw()[:40] 'This is the first sentence. Here is ano' Check that reading individual documents works, and reading all documents at once works: >>> len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()] (46, [40, 6]) >>> corpus.words('a.txt') ['This', 'is', 'the', 'first', 'sentence', '.', ...] >>> corpus.words('b.txt') ['This', 'is', 'the', 'second', 'file', '.'] >>> corpus.words()[:4], corpus.words()[-4:] (['This', 'is', 'the', 'first'], ['the', 'second', 'file', '.']) We're done with the test corpus: >>> del_testcorpus(root) Test the plaintext corpora that come with nltk: >>> from nltk.corpus import abc, genesis, inaugural >>> from nltk.corpus import state_union, webtext >>> for corpus in (abc, genesis, inaugural, state_union, ... webtext): ... print(str(corpus).replace('\\\\','/')) ... print(' ', repr(corpus.fileids())[:60]) ... print(' ', repr(corpus.words()[:10])[:60]) ['rural.txt', 'science.txt'] ['PM', 'denies', 'knowledge', 'of', 'AWB', ... ['english-kjv.txt', 'english-web.txt', 'finnish.txt', ... ['In', 'the', 'beginning', 'God', 'created', 'the', ... ['1789-Washington.txt', '1793-Washington.txt', ... ['Fellow', '-', 'Citizens', 'of', 'the', 'Senate', ... ['1945-Truman.txt', '1946-Truman.txt', ... ['PRESIDENT', 'HARRY', 'S', '.', 'TRUMAN', "'", ... ['firefox.txt', 'grail.txt', 'overheard.txt', ... ['Cookie', 'Manager', ':', '"', 'Don', "'", 't', ... Tagged Corpus Reader ==================== The Tagged Corpus reader can give us words, sentences, and paragraphs, each tagged or untagged. All of the read methods can take one item (in which case they return the contents of that file) or a list of documents (in which case they concatenate the contents of those files). By default, they apply to all documents in the corpus. >>> root = make_testcorpus( ... a="""\ ... This/det is/verb the/det first/adj sentence/noun ./punc ... Here/det is/verb another/adj sentence/noun ./punc ... Note/verb that/comp you/pron can/verb use/verb \ ... any/noun tag/noun set/noun ... ... This/det is/verb the/det second/adj paragraph/noun ./punc ... word/n without/adj a/det tag/noun :/: hello ./punc ... """, ... b="""\ ... This/det is/verb the/det second/adj file/noun ./punc ... """) >>> from nltk.corpus.reader.tagged import TaggedCorpusReader >>> corpus = TaggedCorpusReader(root, list('ab')) >>> corpus.fileids() ['a', 'b'] >>> str(corpus.root) == str(root) True >>> corpus.words() ['This', 'is', 'the', 'first', 'sentence', '.', ...] >>> corpus.sents() # doctest: +ELLIPSIS [['This', 'is', 'the', 'first', ...], ['Here', 'is', 'another'...], ...] >>> corpus.paras() # doctest: +ELLIPSIS [[['This', ...], ['Here', ...], ...], [['This', ...], ...], ...] >>> corpus.tagged_words() # doctest: +ELLIPSIS [('This', 'DET'), ('is', 'VERB'), ('the', 'DET'), ...] >>> corpus.tagged_sents() # doctest: +ELLIPSIS [[('This', 'DET'), ('is', 'VERB'), ...], [('Here', 'DET'), ...], ...] >>> corpus.tagged_paras() # doctest: +ELLIPSIS [[[('This', 'DET'), ...], ...], [[('This', 'DET'), ...], ...], ...] 
>>> corpus.raw()[:40] 'This/det is/verb the/det first/adj sente' >>> len(corpus.words()), [len(corpus.words(d)) for d in corpus.fileids()] (38, [32, 6]) >>> len(corpus.sents()), [len(corpus.sents(d)) for d in corpus.fileids()] (6, [5, 1]) >>> len(corpus.paras()), [len(corpus.paras(d)) for d in corpus.fileids()] (3, [2, 1]) >>> print(corpus.words('a')) ['This', 'is', 'the', 'first', 'sentence', '.', ...] >>> print(corpus.words('b')) ['This', 'is', 'the', 'second', 'file', '.'] >>> del_testcorpus(root) The Brown Corpus uses the tagged corpus reader: >>> from nltk.corpus import brown >>> brown.fileids() # doctest: +ELLIPSIS ['ca01', 'ca02', 'ca03', 'ca04', 'ca05', 'ca06', 'ca07', ...] >>> brown.categories() # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] >>> print(repr(brown.root).replace('\\\\','/')) # doctest: +ELLIPSIS FileSystemPathPointer('.../corpora/brown') >>> brown.words() ['The', 'Fulton', 'County', 'Grand', 'Jury', ...] >>> brown.sents() # doctest: +ELLIPSIS [['The', 'Fulton', 'County', 'Grand', ...], ...] >>> brown.paras() # doctest: +ELLIPSIS [[['The', 'Fulton', 'County', ...]], [['The', 'jury', ...]], ...] >>> brown.tagged_words() # doctest: +ELLIPSIS [('The', 'AT'), ('Fulton', 'NP-TL'), ...] >>> brown.tagged_sents() # doctest: +ELLIPSIS [[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ...], ...] >>> brown.tagged_paras() # doctest: +ELLIPSIS [[[('The', 'AT'), ...]], [[('The', 'AT'), ...]], ...] Verbnet Corpus Reader ===================== Make sure we're picking up the right number of elements: >>> from nltk.corpus import verbnet >>> len(verbnet.lemmas()) 3621 >>> len(verbnet.wordnetids()) 4953 >>> len(verbnet.classids()) 429 Selecting classids based on various selectors: >>> verbnet.classids(lemma='take') # doctest: +NORMALIZE_WHITESPACE ['bring-11.3', 'characterize-29.2', 'convert-26.6.2', 'cost-54.2', 'fit-54.3', 'performance-26.7-2', 'steal-10.5'] >>> verbnet.classids(wordnetid='lead%2:38:01') ['accompany-51.7'] >>> verbnet.classids(fileid='approve-77.xml') ['approve-77'] >>> verbnet.classids(classid='admire-31.2') # subclasses ['admire-31.2-1'] vnclass() accepts filenames, long ids, and short ids: >>> a = ElementTree.tostring(verbnet.vnclass('admire-31.2.xml')) >>> b = ElementTree.tostring(verbnet.vnclass('admire-31.2')) >>> c = ElementTree.tostring(verbnet.vnclass('31.2')) >>> a == b == c True fileids() can be used to get files based on verbnet class ids: >>> verbnet.fileids('admire-31.2') ['admire-31.2.xml'] >>> verbnet.fileids(['admire-31.2', 'obtain-13.5.2']) ['admire-31.2.xml', 'obtain-13.5.2.xml'] >>> verbnet.fileids('badidentifier') Traceback (most recent call last): . . . ValueError: vnclass identifier 'badidentifier' not found longid() and shortid() can be used to convert identifiers: >>> verbnet.longid('31.2') 'admire-31.2' >>> verbnet.longid('admire-31.2') 'admire-31.2' >>> verbnet.shortid('31.2') '31.2' >>> verbnet.shortid('admire-31.2') '31.2' >>> verbnet.longid('badidentifier') Traceback (most recent call last): . . . ValueError: vnclass identifier 'badidentifier' not found >>> verbnet.shortid('badidentifier') Traceback (most recent call last): . . . 
ValueError: vnclass identifier 'badidentifier' not found Corpus View Regression Tests ============================ Select some corpus files to play with: >>> import nltk.data >>> # A very short file (160 chars): >>> f1 = nltk.data.find('corpora/inaugural/README') >>> # A relatively short file (791 chars): >>> f2 = nltk.data.find('corpora/inaugural/1793-Washington.txt') >>> # A longer file (32k chars): >>> f3 = nltk.data.find('corpora/inaugural/1909-Taft.txt') >>> fileids = [f1, f2, f3] Concatenation ------------- Check that concatenation works as intended. >>> from nltk.corpus.reader.util import * >>> c1 = StreamBackedCorpusView(f1, read_whitespace_block, encoding='utf-8') >>> c2 = StreamBackedCorpusView(f2, read_whitespace_block, encoding='utf-8') >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block, encoding='utf-8') >>> c123 = c1+c2+c3 >>> print(c123) ['C-Span', 'Inaugural', 'Address', 'Corpus', 'US', ...] >>> l1 = f1.open(encoding='utf-8').read().split() >>> l2 = f2.open(encoding='utf-8').read().split() >>> l3 = f3.open(encoding='utf-8').read().split() >>> l123 = l1+l2+l3 >>> list(c123) == l123 True >>> (c1+c2+c3)[100] == l123[100] True Slicing ------- First, do some tests with fairly small slices. These will all generate tuple values. >>> from nltk.util import LazySubsequence >>> c1 = StreamBackedCorpusView(f1, read_whitespace_block, encoding='utf-8') >>> l1 = f1.open(encoding='utf-8').read().split() >>> print(len(c1)) 21 >>> len(c1) < LazySubsequence.MIN_SIZE True Choose a list of indices, based on the length, that covers the important corner cases: >>> indices = [-60, -30, -22, -21, -20, -1, ... 0, 1, 10, 20, 21, 22, 30, 60] Test slicing with explicit start & stop value: >>> for s in indices: ... for e in indices: ... assert list(c1[s:e]) == l1[s:e] Test slicing with stop=None: >>> for s in indices: ... assert list(c1[s:]) == l1[s:] Test slicing with start=None: >>> for e in indices: ... assert list(c1[:e]) == l1[:e] Test slicing with start=stop=None: >>> list(c1[:]) == list(l1[:]) True Next, we'll do some tests with much longer slices. These will generate LazySubsequence objects. >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block, encoding='utf-8') >>> l3 = f3.open(encoding='utf-8').read().split() >>> print(len(c3)) 5430 >>> len(c3) > LazySubsequence.MIN_SIZE*2 True Choose a list of indices, based on the length, that covers the important corner cases: >>> indices = [-12000, -6000, -5431, -5430, -5429, -3000, -200, -1, ... 0, 1, 200, 3000, 5000, 5429, 5430, 5431, 6000, 12000] Test slicing with explicit start & stop value: >>> for s in indices: ... for e in indices: ... assert list(c3[s:e]) == l3[s:e] Test slicing with stop=None: >>> for s in indices: ... assert list(c3[s:]) == l3[s:] Test slicing with start=None: >>> for e in indices: ... assert list(c3[:e]) == l3[:e] Test slicing with start=stop=None: >>> list(c3[:]) == list(l3[:]) True Multiple Iterators ------------------ If multiple iterators are created for the same corpus view, their iteration can be interleaved: >>> c3 = StreamBackedCorpusView(f3, read_whitespace_block) >>> iterators = [c3.iterate_from(n) for n in [0,15,30,45]] >>> for i in range(15): ... for iterator in iterators: ... print('%-15s' % next(iterator), end=' ') ... print() My a duties in fellow heavy of a citizens: weight the proper Anyone of office sense who responsibility. upon of has If which the taken not, he obligation the he is which oath has about the I no to oath have conception enter, imposes. 
just of or The taken the he office must powers is of feel and lacking an SeekableUnicodeStreamReader =========================== The file-like objects provided by the ``codecs`` module unfortunately suffer from a bug that prevents them from working correctly with corpus view objects. In particular, although the expose ``seek()`` and ``tell()`` methods, those methods do not exhibit the expected behavior, because they are not synchronized with the internal buffers that are kept by the file-like objects. For example, the ``tell()`` method will return the file position at the end of the buffers (whose contents have not yet been returned by the stream); and therefore this file position can not be used to return to the 'current' location in the stream (since ``seek()`` has no way to reconstruct the buffers). To get around these problems, we define a new class, `SeekableUnicodeStreamReader`, to act as a file-like interface to files containing encoded unicode data. This class is loosely based on the ``codecs.StreamReader`` class. To construct a new reader, we call the constructor with an underlying stream and an encoding name: >>> from io import StringIO, BytesIO >>> from nltk.data import SeekableUnicodeStreamReader >>> stream = BytesIO(b"""\ ... This is a test file. ... It is encoded in ascii. ... """.decode('ascii').encode('ascii')) >>> reader = SeekableUnicodeStreamReader(stream, 'ascii') `SeekableUnicodeStreamReader`\ s support all of the normal operations supplied by a read-only stream. Note that all of the read operations return ``unicode`` objects (not ``str`` objects). >>> reader.read() # read the entire file. u'This is a test file.\nIt is encoded in ascii.\n' >>> reader.seek(0) # rewind to the start. >>> reader.read(5) # read at most 5 bytes. u'This ' >>> reader.readline() # read to the end of the line. u'is a test file.\n' >>> reader.seek(0) # rewind to the start. >>> for line in reader: ... print(repr(line)) # iterate over lines u'This is a test file.\n' u'It is encoded in ascii.\n' >>> reader.seek(0) # rewind to the start. >>> reader.readlines() # read a list of line strings [u'This is a test file.\n', u'It is encoded in ascii.\n'] >>> reader.close() Size argument to ``read()`` --------------------------- The ``size`` argument to ``read()`` specifies the maximum number of *bytes* to read, not the maximum number of *characters*. Thus, for encodings that use multiple bytes per character, it may return fewer characters than the ``size`` argument: >>> stream = BytesIO(b"""\ ... This is a test file. ... It is encoded in utf-16. ... """.decode('ascii').encode('utf-16')) >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16') >>> reader.read(10) u'This ' If a read block ends in the middle of the byte string encoding a single character, then that byte string is stored in an internal buffer, and re-used on the next call to ``read()``. However, if the size argument is too small to read even a single character, even though at least one character is available, then the ``read()`` method will read additional bytes until it can return a single character. This ensures that the ``read()`` method does not return an empty string, which could be mistaken for indicating the end of the file. >>> reader.seek(0) # rewind to the start. >>> reader.read(1) # we actually need to read 4 bytes u'T' >>> int(reader.tell()) 4 The ``readline()`` method may read more than a single line of text, in which case it stores the text that it does not return in a buffer. 
If this buffer is not empty, then its contents will be included in the value returned by the next call to ``read()``, regardless of the ``size`` argument, since they are available without reading any new bytes from the stream: >>> reader.seek(0) # rewind to the start. >>> reader.readline() # stores extra text in a buffer u'This is a test file.\n' >>> print(reader.linebuffer) # examine the buffer contents [u'It is encoded i'] >>> reader.read(0) # returns the contents of the buffer u'It is encoded i' >>> print(reader.linebuffer) # examine the buffer contents None Seek and Tell ------------- In addition to these basic read operations, `SeekableUnicodeStreamReader` also supports the ``seek()`` and ``tell()`` operations. However, some care must still be taken when using these operations. In particular, the only file offsets that should be passed to ``seek()`` are ``0`` and any offset that has been returned by ``tell``. >>> stream = BytesIO(b"""\ ... This is a test file. ... It is encoded in utf-16. ... """.decode('ascii').encode('utf-16')) >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16') >>> reader.read(20) u'This is a ' >>> pos = reader.tell(); print(pos) 22 >>> reader.read(20) u'test file.' >>> reader.seek(pos) # rewind to the position from tell. >>> reader.read(20) u'test file.' The ``seek()`` and ``tell()`` methods work property even when ``readline()`` is used. >>> stream = BytesIO(b"""\ ... This is a test file. ... It is encoded in utf-16. ... """.decode('ascii').encode('utf-16')) >>> reader = SeekableUnicodeStreamReader(stream, 'utf-16') >>> reader.readline() u'This is a test file.\n' >>> pos = reader.tell(); print(pos) 44 >>> reader.readline() u'It is encoded in utf-16.\n' >>> reader.seek(pos) # rewind to the position from tell. >>> reader.readline() u'It is encoded in utf-16.\n' Squashed Bugs ============= svn 5276 fixed a bug in the comment-stripping behavior of parse_sexpr_block. >>> from io import StringIO >>> from nltk.corpus.reader.util import read_sexpr_block >>> f = StringIO(b""" ... (a b c) ... # This line is a comment. ... (d e f\ng h)""".decode('ascii')) >>> print(read_sexpr_block(f, block_size=38, comment_char='#')) ['(a b c)'] >>> print(read_sexpr_block(f, block_size=38, comment_char='#')) ['(d e f\ng h)'] svn 5277 fixed a bug in parse_sexpr_block, which would cause it to enter an infinite loop if a file ended mid-sexpr, or ended with a token that was not followed by whitespace. A related bug caused an infinite loop if the corpus ended in an unmatched close paren -- this was fixed in svn 5279 >>> f = StringIO(b""" ... This file ends mid-sexpr ... (hello (world""".decode('ascii')) >>> for i in range(3): print(read_sexpr_block(f)) ['This', 'file', 'ends', 'mid-sexpr'] ['(hello (world'] [] >>> f = StringIO(b"This file has no trailing whitespace.".decode('ascii')) >>> for i in range(3): print(read_sexpr_block(f)) ['This', 'file', 'has', 'no', 'trailing'] ['whitespace.'] [] >>> # Bug fixed in 5279: >>> f = StringIO(b"a b c)".decode('ascii')) >>> for i in range(3): print(read_sexpr_block(f)) ['a', 'b'] ['c)'] [] svn 5624 & 5265 fixed a bug in ConcatenatedCorpusView, which caused it to return the wrong items when indexed starting at any index beyond the first file. >>> import nltk >>> sents = nltk.corpus.brown.sents() >>> print(sents[6000]) ['Cholesterol', 'and', 'thyroid'] >>> print(sents[6000]) ['Cholesterol', 'and', 'thyroid'] svn 5728 fixed a bug in Categorized*CorpusReader, which caused them to return words from *all* files when just one file was specified. 
>>> from nltk.corpus import reuters >>> reuters.words('training/13085') ['SNYDER', '&', 'lt', ';', 'SOI', '>', 'MAKES', ...] >>> reuters.words('training/5082') ['SHEPPARD', 'RESOURCES', 'TO', 'MERGE', 'WITH', ...] svn 7227 fixed a bug in the qc corpus reader, which prevented access to its tuples() method >>> from nltk.corpus import qc >>> qc.tuples('test.txt') [('NUM:dist', 'How far is it from Denver to Aspen ?'), ('LOC:city', 'What county is Modesto , California in ?'), ...] nltk-3.1/nltk/test/corpus_fixt.py0000644000076500000240000000014712574600335016731 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import from nltk.corpus import teardown_modulenltk-3.1/nltk/test/crubadan.doctest0000644000076500000240000000377012607224144017162 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT Crubadan Corpus Reader ====================== Crubadan is an NLTK corpus reader for ngram files provided by the Crubadan project. It supports several languages. >>> from nltk.corpus import crubadan >>> crubadan.langs() # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE ['abk', 'abn',..., 'zpa', 'zul'] ---------------------------------------- Language code mapping and helper methods ---------------------------------------- The web crawler that generates the 3-gram frequencies works at the level of "writing systems" rather than languages. Writing systems are assigned internal 2-3 letter codes that require mapping to the standard ISO 639-3 codes. For more information, please refer to the README in nltk_data/crubadan folder after installing it. To translate ISO 639-3 codes to "Crubadan Code": >>> crubadan.iso_to_crubadan('eng') 'en' >>> crubadan.iso_to_crubadan('fra') 'fr' >>> crubadan.iso_to_crubadan('aaa') In reverse, print ISO 639-3 code if we have the Crubadan Code: >>> crubadan.crubadan_to_iso('en') 'eng' >>> crubadan.crubadan_to_iso('fr') 'fra' >>> crubadan.crubadan_to_iso('aa') --------------------------- Accessing ngram frequencies --------------------------- On initialization the reader will create a dictionary of every language supported by the Crubadan project, mapping the ISO 639-3 language code to its corresponding ngram frequency. You can access individual language FreqDist and the ngrams within them as follows: >>> english_fd = crubadan.lang_freq('eng') >>> english_fd['the'] 728135 Above accesses the FreqDist of English and returns the frequency of the ngram 'the'. A ngram that isn't found within the language will return 0: >>> english_fd['sometest'] 0 A language that isn't supported will raise an exception: >>> crubadan.lang_freq('elvish') Traceback (most recent call last): ... RuntimeError: Unsupported language. nltk-3.1/nltk/test/data.doctest0000644000076500000240000003317012607224144016311 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ========================================= Loading Resources From the Data Package ========================================= >>> import nltk.data Overview ~~~~~~~~ The `nltk.data` module contains functions that can be used to load NLTK resource files, such as corpora, grammars, and saved processing objects. Loading Data Files ~~~~~~~~~~~~~~~~~~ Resources are loaded using the function `nltk.data.load()`, which takes as its first argument a URL specifying what file should be loaded. 
The ``nltk:`` protocol loads files from the NLTK data distribution: >>> from __future__ import print_function >>> tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle') >>> tokenizer.tokenize('Hello. This is a test. It works!') ['Hello.', 'This is a test.', 'It works!'] It is important to note that there should be no space following the colon (':') in the URL; 'nltk: tokenizers/punkt/english.pickle' will not work! The ``nltk:`` protocol is used by default if no protocol is specified: >>> nltk.data.load('tokenizers/punkt/english.pickle') # doctest: +ELLIPSIS But it is also possible to load resources from ``http:``, ``ftp:``, and ``file:`` URLs, e.g. ``cfg = nltk.data.load('http://example.com/path/to/toy.cfg')`` >>> # Load a grammar using an absolute path. >>> url = 'file:%s' % nltk.data.find('grammars/sample_grammars/toy.cfg') >>> url.replace('\\', '/') # doctest: +ELLIPSIS 'file:...toy.cfg' >>> print(nltk.data.load(url)) # doctest: +ELLIPSIS Grammar with 14 productions (start state = S) S -> NP VP PP -> P NP ... P -> 'on' P -> 'in' The second argument to the `nltk.data.load()` function specifies the file format, which determines how the file's contents are processed before they are returned by ``load()``. The formats that are currently supported by the data module are described by the dictionary `nltk.data.FORMATS`: >>> for format, descr in sorted(nltk.data.FORMATS.items()): ... print('{0:<7} {1:}'.format(format, descr)) # doctest: +NORMALIZE_WHITESPACE cfg A context free grammar. fcfg A feature CFG. fol A list of first order logic expressions, parsed with nltk.sem.logic.Expression.fromstring. json A serialized python object, stored using the json module. logic A list of first order logic expressions, parsed with nltk.sem.logic.LogicParser. Requires an additional logic_parser parameter pcfg A probabilistic CFG. pickle A serialized python object, stored using the pickle module. raw The raw (byte string) contents of a file. text The raw (unicode string) contents of a file. val A semantic valuation, parsed by nltk.sem.Valuation.fromstring. yaml A serialized python object, stored using the yaml module. `nltk.data.load()` will raise a ValueError if a bad format name is specified: >>> nltk.data.load('grammars/sample_grammars/toy.cfg', 'bar') Traceback (most recent call last): . . . ValueError: Unknown format type! By default, the ``"auto"`` format is used, which chooses a format based on the filename's extension. The mapping from file extensions to format names is specified by `nltk.data.AUTO_FORMATS`: >>> for ext, format in sorted(nltk.data.AUTO_FORMATS.items()): ... print('.%-7s -> %s' % (ext, format)) .cfg -> cfg .fcfg -> fcfg .fol -> fol .json -> json .logic -> logic .pcfg -> pcfg .pickle -> pickle .text -> text .txt -> text .val -> val .yaml -> yaml If `nltk.data.load()` is unable to determine the format based on the filename's extension, it will raise a ValueError: >>> nltk.data.load('foo.bar') Traceback (most recent call last): . . . ValueError: Could not determine format for foo.bar based on its file extension; use the "format" argument to specify the format explicitly. Note that by explicitly specifying the ``format`` argument, you can override the load method's default processing behavior. For example, to get the raw contents of any file, simply use ``format="raw"``: >>> s = nltk.data.load('grammars/sample_grammars/toy.cfg', 'text') >>> print(s) # doctest: +ELLIPSIS S -> NP VP PP -> P NP NP -> Det N | NP PP VP -> V NP | VP PP ... Making Local Copies ~~~~~~~~~~~~~~~~~~~ .. 
This will not be visible in the html output: create a tempdir to play in. >>> import tempfile, os >>> tempdir = tempfile.mkdtemp() >>> old_dir = os.path.abspath('.') >>> os.chdir(tempdir) The function `nltk.data.retrieve()` copies a given resource to a local file. This can be useful, for example, if you want to edit one of the sample grammars. >>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg') Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy.cfg' >>> # Simulate editing the grammar. >>> with open('toy.cfg') as inp: ... s = inp.read().replace('NP', 'DP') >>> with open('toy.cfg', 'w') as out: ... _bytes_written = out.write(s) >>> # Load the edited grammar, & display it. >>> cfg = nltk.data.load('file:///' + os.path.abspath('toy.cfg')) >>> print(cfg) # doctest: +ELLIPSIS Grammar with 14 productions (start state = S) S -> DP VP PP -> P DP ... P -> 'on' P -> 'in' The second argument to `nltk.data.retrieve()` specifies the filename for the new copy of the file. By default, the source file's filename is used. >>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg', 'mytoy.cfg') Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'mytoy.cfg' >>> os.path.isfile('./mytoy.cfg') True >>> nltk.data.retrieve('grammars/sample_grammars/np.fcfg') Retrieving 'nltk:grammars/sample_grammars/np.fcfg', saving to 'np.fcfg' >>> os.path.isfile('./np.fcfg') True If a file with the specified (or default) filename already exists in the current directory, then `nltk.data.retrieve()` will raise a ValueError exception. It will *not* overwrite the file: >>> os.path.isfile('./toy.cfg') True >>> nltk.data.retrieve('grammars/sample_grammars/toy.cfg') # doctest: +ELLIPSIS Traceback (most recent call last): . . . ValueError: File '...toy.cfg' already exists! .. This will not be visible in the html output: clean up the tempdir. >>> os.chdir(old_dir) >>> for f in os.listdir(tempdir): ... os.remove(os.path.join(tempdir, f)) >>> os.rmdir(tempdir) Finding Files in the NLTK Data Package ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The `nltk.data.find()` function searches the NLTK data package for a given file, and returns a pointer to that file. This pointer can either be a `FileSystemPathPointer` (whose `path` attribute gives the absolute path of the file); or a `ZipFilePathPointer`, specifying a zipfile and the name of an entry within that zipfile. Both pointer types define the `open()` method, which can be used to read the string contents of the file. >>> path = nltk.data.find('corpora/abc/rural.txt') >>> str(path) # doctest: +ELLIPSIS '...rural.txt' >>> print(path.open().read(60).decode()) PM denies knowledge of AWB kickbacks The Prime Minister has Alternatively, the `nltk.data.load()` function can be used with the keyword argument ``format="raw"``: >>> s = nltk.data.load('corpora/abc/rural.txt', format='raw')[:60] >>> print(s.decode()) PM denies knowledge of AWB kickbacks The Prime Minister has Alternatively, you can use the keyword argument ``format="text"``: >>> s = nltk.data.load('corpora/abc/rural.txt', format='text')[:60] >>> print(s) PM denies knowledge of AWB kickbacks The Prime Minister has Resource Caching ~~~~~~~~~~~~~~~~ NLTK uses a weakref dictionary to maintain a cache of resources that have been loaded. If you load a resource that is already stored in the cache, then the cached copy will be returned. 
This behavior can be seen by the trace output generated when verbose=True: >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', verbose=True) <> >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', verbose=True) <> If you wish to load a resource from its source, bypassing the cache, use the ``cache=False`` argument to `nltk.data.load()`. This can be useful, for example, if the resource is loaded from a local file, and you are actively editing that file: >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg',cache=False,verbose=True) <> The cache *no longer* uses weak references. A resource will not be automatically expunged from the cache when no more objects are using it. In the following example, when we clear the variable ``feat0``, the reference count for the feature grammar object drops to zero. However, the object remains cached: >>> del feat0 >>> feat0 = nltk.data.load('grammars/book_grammars/feat0.fcfg', ... verbose=True) <> You can clear the entire contents of the cache, using `nltk.data.clear_cache()`: >>> nltk.data.clear_cache() Retrieving other Data Sources ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ >>> formulas = nltk.data.load('grammars/book_grammars/background.fol') >>> for f in formulas: print(str(f)) all x.(boxerdog(x) -> dog(x)) all x.(boxer(x) -> person(x)) all x.-(dog(x) & person(x)) all x.(married(x) <-> exists y.marry(x,y)) all x.(bark(x) -> dog(x)) all x y.(marry(x,y) -> (person(x) & person(y))) -(Vincent = Mia) -(Vincent = Fido) -(Mia = Fido) Regression Tests ~~~~~~~~~~~~~~~~ Create a temp dir for tests that write files: >>> import tempfile, os >>> tempdir = tempfile.mkdtemp() >>> old_dir = os.path.abspath('.') >>> os.chdir(tempdir) The `retrieve()` function accepts all url types: >>> urls = ['https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg', ... 'file:%s' % nltk.data.find('grammars/sample_grammars/toy.cfg'), ... 'nltk:grammars/sample_grammars/toy.cfg', ... 'grammars/sample_grammars/toy.cfg'] >>> for i, url in enumerate(urls): ... nltk.data.retrieve(url, 'toy-%d.cfg' % i) # doctest: +ELLIPSIS Retrieving 'https://raw.githubusercontent.com/nltk/nltk/develop/nltk/test/toy.cfg', saving to 'toy-0.cfg' Retrieving 'file:...toy.cfg', saving to 'toy-1.cfg' Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy-2.cfg' Retrieving 'nltk:grammars/sample_grammars/toy.cfg', saving to 'toy-3.cfg' Clean up the temp dir: >>> os.chdir(old_dir) >>> for f in os.listdir(tempdir): ... os.remove(os.path.join(tempdir, f)) >>> os.rmdir(tempdir) Lazy Loader ----------- A lazy loader is a wrapper object that defers loading a resource until it is accessed or used in any way. This is mainly intended for internal use by NLTK's corpus readers. >>> # Create a lazy loader for toy.cfg. >>> ll = nltk.data.LazyLoader('grammars/sample_grammars/toy.cfg') >>> # Show that it's not loaded yet: >>> object.__repr__(ll) # doctest: +ELLIPSIS '' >>> # printing it is enough to cause it to be loaded: >>> print(ll) >>> # Show that it's now been loaded: >>> object.__repr__(ll) # doctest: +ELLIPSIS '' >>> # Test that accessing an attribute also loads it: >>> ll = nltk.data.LazyLoader('grammars/sample_grammars/toy.cfg') >>> ll.start() S >>> object.__repr__(ll) # doctest: +ELLIPSIS '' Buffered Gzip Reading and Writing --------------------------------- Write performance to gzip-compressed is extremely poor when the files become large. File creation can become a bottleneck in those cases. 
Read performance from large gzipped pickle files was improved in data.py by buffering the reads. A similar fix can be applied to writes by buffering the writes to a StringIO object first. This is mainly intended for internal use. The test simply tests that reading and writing work as intended and does not test how much improvement buffering provides. >>> from nltk.compat import StringIO >>> test = nltk.data.BufferedGzipFile('testbuf.gz', 'wb', size=2**10) >>> ans = [] >>> for i in range(10000): ... ans.append(str(i).encode('ascii')) ... test.write(str(i).encode('ascii')) >>> test.close() >>> test = nltk.data.BufferedGzipFile('testbuf.gz', 'rb') >>> test.read() == b''.join(ans) True >>> test.close() >>> import os >>> os.unlink('testbuf.gz') JSON Encoding and Decoding -------------------------- JSON serialization is used instead of pickle for some classes. >>> from nltk import jsontags >>> from nltk.jsontags import JSONTaggedEncoder, JSONTaggedDecoder, register_tag >>> @jsontags.register_tag ... class JSONSerializable: ... json_tag = 'JSONSerializable' ... ... def __init__(self, n): ... self.n = n ... ... def encode_json_obj(self): ... return self.n ... ... @classmethod ... def decode_json_obj(cls, obj): ... n = obj ... return cls(n) ... >>> JSONTaggedEncoder().encode(JSONSerializable(1)) '{"!JSONSerializable": 1}' >>> JSONTaggedDecoder().decode('{"!JSONSerializable": 1}').n 1 nltk-3.1/nltk/test/dependency.doctest0000755000076500000240000001651012607224144017520 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT =================== Dependency Grammars =================== >>> from nltk.grammar import DependencyGrammar >>> from nltk.parse import ( ... DependencyGraph, ... ProjectiveDependencyParser, ... NonprojectiveDependencyParser, ... ) CoNLL Data ---------- >>> treebank_data = """Pierre NNP 2 NMOD ... Vinken NNP 8 SUB ... , , 2 P ... 61 CD 5 NMOD ... years NNS 6 AMOD ... old JJ 2 NMOD ... , , 2 P ... will MD 0 ROOT ... join VB 8 VC ... the DT 11 NMOD ... board NN 9 OBJ ... as IN 9 VMOD ... a DT 15 NMOD ... nonexecutive JJ 15 NMOD ... director NN 12 PMOD ... Nov. NNP 9 VMOD ... 29 CD 16 NMOD ... . . 9 VMOD ... """ >>> dg = DependencyGraph(treebank_data) >>> dg.tree().pprint() (will (Vinken Pierre , (old (years 61)) ,) (join (board the) (as (director a nonexecutive)) (Nov. 29) .)) >>> for head, rel, dep in dg.triples(): ... print( ... '({h[0]}, {h[1]}), {r}, ({d[0]}, {d[1]})' ... .format(h=head, r=rel, d=dep) ... ) (will, MD), SUB, (Vinken, NNP) (Vinken, NNP), NMOD, (Pierre, NNP) (Vinken, NNP), P, (,, ,) (Vinken, NNP), NMOD, (old, JJ) (old, JJ), AMOD, (years, NNS) (years, NNS), NMOD, (61, CD) (Vinken, NNP), P, (,, ,) (will, MD), VC, (join, VB) (join, VB), OBJ, (board, NN) (board, NN), NMOD, (the, DT) (join, VB), VMOD, (as, IN) (as, IN), PMOD, (director, NN) (director, NN), NMOD, (a, DT) (director, NN), NMOD, (nonexecutive, JJ) (join, VB), VMOD, (Nov., NNP) (Nov., NNP), NMOD, (29, CD) (join, VB), VMOD, (., .) Using a custom cell extractor. >>> def custom_extractor(cells): ... _, tag, head, rel = cells ... return 'spam', 'spam', tag, tag, '', head, rel >>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor) >>> dg.tree().pprint() (spam (spam spam spam (spam (spam spam)) spam) (spam (spam spam) (spam (spam spam spam)) (spam spam) spam)) Custom cell extractors can take in and return an index. >>> def custom_extractor(cells, index): ... word, tag, head, rel = cells ... 
return (index, '{}-{}'.format(word, index), word, ... tag, tag, '', head, rel) >>> dg = DependencyGraph(treebank_data, cell_extractor=custom_extractor) >>> dg.tree().pprint() (will-8 (Vinken-2 Pierre-1 ,-3 (old-6 (years-5 61-4)) ,-7) (join-9 (board-11 the-10) (as-12 (director-15 a-13 nonexecutive-14)) (Nov.-16 29-17) .-18)) Using the dependency-parsed version of the Penn Treebank corpus sample. >>> from nltk.corpus import dependency_treebank >>> t = dependency_treebank.parsed_sents()[0] >>> print(t.to_conll(3)) # doctest: +NORMALIZE_WHITESPACE Pierre NNP 2 Vinken NNP 8 , , 2 61 CD 5 years NNS 6 old JJ 2 , , 2 will MD 0 join VB 8 the DT 11 board NN 9 as IN 9 a DT 15 nonexecutive JJ 15 director NN 12 Nov. NNP 9 29 CD 16 . . 8 Using the output of zpar (like Malt-TAB but with zero-based indexing) >>> zpar_data = """ ... Pierre NNP 1 NMOD ... Vinken NNP 7 SUB ... , , 1 P ... 61 CD 4 NMOD ... years NNS 5 AMOD ... old JJ 1 NMOD ... , , 1 P ... will MD -1 ROOT ... join VB 7 VC ... the DT 10 NMOD ... board NN 8 OBJ ... as IN 8 VMOD ... a DT 14 NMOD ... nonexecutive JJ 14 NMOD ... director NN 11 PMOD ... Nov. NNP 8 VMOD ... 29 CD 15 NMOD ... . . 7 P ... """ >>> zdg = DependencyGraph(zpar_data, zero_based=True) >>> print(zdg.tree()) (will (Vinken Pierre , (old (years 61)) ,) (join (board the) (as (director a nonexecutive)) (Nov. 29)) .) Projective Dependency Parsing ----------------------------- >>> grammar = DependencyGrammar.fromstring(""" ... 'fell' -> 'price' | 'stock' ... 'price' -> 'of' 'the' ... 'of' -> 'stock' ... 'stock' -> 'the' ... """) >>> print(grammar) Dependency grammar with 5 productions 'fell' -> 'price' 'fell' -> 'stock' 'price' -> 'of' 'the' 'of' -> 'stock' 'stock' -> 'the' >>> dp = ProjectiveDependencyParser(grammar) >>> for t in sorted(dp.parse(['the', 'price', 'of', 'the', 'stock', 'fell'])): ... print(t) (fell (price the (of (stock the)))) (fell (price the of) (stock the)) (fell (price the of the) stock) Non-Projective Dependency Parsing --------------------------------- >>> grammar = DependencyGrammar.fromstring(""" ... 'taught' -> 'play' | 'man' ... 'man' -> 'the' ... 'play' -> 'golf' | 'dog' | 'to' ... 'dog' -> 'his' ... """) >>> print(grammar) Dependency grammar with 7 productions 'taught' -> 'play' 'taught' -> 'man' 'man' -> 'the' 'play' -> 'golf' 'play' -> 'dog' 'play' -> 'to' 'dog' -> 'his' >>> dp = NonprojectiveDependencyParser(grammar) >>> g, = dp.parse(['the', 'man', 'taught', 'his', 'dog', 'to', 'play', 'golf']) >>> print(g.root['word']) taught >>> for _, node in sorted(g.nodes.items()): ... if node['word'] is not None: ... print('{address} {word}: {d}'.format(d=node['deps'][''], **node)) 1 the: [] 2 man: [1] 3 taught: [2, 7] 4 his: [] 5 dog: [4] 6 to: [] 7 play: [5, 6, 8] 8 golf: [] >>> print(g.tree()) (taught (man the) (play (dog his) to golf)) Integration with MALT parser ============================ In case the top relation is different from the default, we can set it. In case of MALT parser, it's set to `'null'`. >>> dg_str = """1 I _ NN NN _ 2 nn _ _ ... 2 shot _ NN NN _ 0 null _ _ ... 3 an _ AT AT _ 2 dep _ _ ... 4 elephant _ NN NN _ 7 nn _ _ ... 5 in _ NN NN _ 7 nn _ _ ... 6 my _ NN NN _ 7 nn _ _ ... 7 pajamas _ NNS NNS _ 3 dobj _ _ ... 
""" >>> dg = DependencyGraph(dg_str, top_relation_label='null') >>> len(dg.nodes) 8 >>> dg.root['word'], dg.root['address'] ('shot', 2) >>> print(dg.to_conll(10)) # doctest: +NORMALIZE_WHITESPACE 1 I _ NN NN _ 2 nn _ _ 2 shot _ NN NN _ 0 null _ _ 3 an _ AT AT _ 2 dep _ _ 4 elephant _ NN NN _ 7 nn _ _ 5 in _ NN NN _ 7 nn _ _ 6 my _ NN NN _ 7 nn _ _ 7 pajamas _ NNS NNS _ 3 dobj _ _ nltk-3.1/nltk/test/discourse.doctest0000644000076500000240000004207012607224144017377 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ================== Discourse Checking ================== >>> from nltk import * >>> from nltk.sem import logic >>> logic._counter._value = 0 Introduction ============ The NLTK discourse module makes it possible to test consistency and redundancy of simple discourses, using theorem-proving and model-building from `nltk.inference`. The ``DiscourseTester`` constructor takes a list of sentences as a parameter. >>> dt = DiscourseTester(['a boxer walks', 'every boxer chases a girl']) The ``DiscourseTester`` parses each sentence into a list of logical forms. Once we have created ``DiscourseTester`` object, we can inspect various properties of the discourse. First off, we might want to double-check what sentences are currently stored as the discourse. >>> dt.sentences() s0: a boxer walks s1: every boxer chases a girl As you will see, each sentence receives an identifier `s`\ :subscript:`i`. We might also want to check what grammar the ``DiscourseTester`` is using (by default, ``book_grammars/discourse.fcfg``): >>> dt.grammar() # doctest: +ELLIPSIS % start S # Grammar Rules S[SEM = ] -> NP[NUM=?n,SEM=?subj] VP[NUM=?n,SEM=?vp] NP[NUM=?n,SEM= ] -> Det[NUM=?n,SEM=?det] Nom[NUM=?n,SEM=?nom] NP[LOC=?l,NUM=?n,SEM=?np] -> PropN[LOC=?l,NUM=?n,SEM=?np] ... A different grammar can be invoked by using the optional ``gramfile`` parameter when a ``DiscourseTester`` object is created. Readings and Threads ==================== Depending on the grammar used, we may find some sentences have more than one logical form. To check this, use the ``readings()`` method. Given a sentence identifier of the form `s`\ :subscript:`i`, each reading of that sentence is given an identifier `s`\ :sub:`i`-`r`\ :sub:`j`. >>> dt.readings() s0 readings: s0-r0: exists z1.(boxer(z1) & walk(z1)) s0-r1: exists z1.(boxerdog(z1) & walk(z1)) s1 readings: s1-r0: all z2.(boxer(z2) -> exists z3.(girl(z3) & chase(z2,z3))) s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2))) In this case, the only source of ambiguity lies in the word *boxer*, which receives two translations: ``boxer`` and ``boxerdog``. The intention is that one of these corresponds to the ``person`` sense and one to the ``dog`` sense. In principle, we would also expect to see a quantifier scope ambiguity in ``s1``. However, the simple grammar we are using, namely `sem4.fcfg `_, doesn't support quantifier scope ambiguity. We can also investigate the readings of a specific sentence: >>> dt.readings('a boxer walks') The sentence 'a boxer walks' has these readings: exists x.(boxer(x) & walk(x)) exists x.(boxerdog(x) & walk(x)) Given that each sentence is two-ways ambiguous, we potentially have four different discourse 'threads', taking all combinations of readings. To see these, specify the ``threaded=True`` parameter on the ``readings()`` method. Again, each thread is assigned an identifier of the form `d`\ :sub:`i`. Following the identifier is a list of the readings that constitute that thread. 
>>> dt.readings(threaded=True) # doctest: +NORMALIZE_WHITESPACE d0: ['s0-r0', 's1-r0'] d1: ['s0-r0', 's1-r1'] d2: ['s0-r1', 's1-r0'] d3: ['s0-r1', 's1-r1'] Of course, this simple-minded approach doesn't scale: a discourse with, say, three sentences, each of which has 3 readings, will generate 27 different threads. It is an interesting exercise to consider how to manage discourse ambiguity more efficiently. Checking Consistency ==================== Now, we can check whether some or all of the discourse threads are consistent, using the ``models()`` method. With no parameter, this method will try to find a model for every discourse thread in the current discourse. However, we can also specify just one thread, say ``d1``. >>> dt.models('d1') -------------------------------------------------------------------------------- Model for Discourse Thread d1 -------------------------------------------------------------------------------- % number = 1 % seconds = 0 % Interpretation of size 2 c1 = 0. f1(0) = 0. f1(1) = 0. boxer(0). - boxer(1). - boxerdog(0). - boxerdog(1). - girl(0). - girl(1). walk(0). - walk(1). - chase(0,0). - chase(0,1). - chase(1,0). - chase(1,1). Consistent discourse: d1 ['s0-r0', 's1-r1']: s0-r0: exists z1.(boxer(z1) & walk(z1)) s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2))) There are various formats for rendering **Mace4** models --- here, we have used the 'cooked' format (which is intended to be human-readable). There are a number of points to note. #. The entities in the domain are all treated as non-negative integers. In this case, there are only two entities, ``0`` and ``1``. #. The ``-`` symbol indicates negation. So ``0`` is the only ``boxerdog`` and the only thing that ``walk``\ s. Nothing is a ``boxer``, or a ``girl`` or in the ``chase`` relation. Thus the universal sentence is vacuously true. #. ``c1`` is an introduced constant that denotes ``0``. #. ``f1`` is a Skolem function, but it plays no significant role in this model. We might want to now add another sentence to the discourse, and there is method ``add_sentence()`` for doing just this. >>> dt.add_sentence('John is a boxer') >>> dt.sentences() s0: a boxer walks s1: every boxer chases a girl s2: John is a boxer We can now test all the properties as before; here, we just show a couple of them. >>> dt.readings() s0 readings: s0-r0: exists z1.(boxer(z1) & walk(z1)) s0-r1: exists z1.(boxerdog(z1) & walk(z1)) s1 readings: s1-r0: all z1.(boxer(z1) -> exists z2.(girl(z2) & chase(z1,z2))) s1-r1: all z1.(boxerdog(z1) -> exists z2.(girl(z2) & chase(z1,z2))) s2 readings: s2-r0: boxer(John) s2-r1: boxerdog(John) >>> dt.readings(threaded=True) # doctest: +NORMALIZE_WHITESPACE d0: ['s0-r0', 's1-r0', 's2-r0'] d1: ['s0-r0', 's1-r0', 's2-r1'] d2: ['s0-r0', 's1-r1', 's2-r0'] d3: ['s0-r0', 's1-r1', 's2-r1'] d4: ['s0-r1', 's1-r0', 's2-r0'] d5: ['s0-r1', 's1-r0', 's2-r1'] d6: ['s0-r1', 's1-r1', 's2-r0'] d7: ['s0-r1', 's1-r1', 's2-r1'] If you are interested in a particular thread, the ``expand_threads()`` method will remind you of what readings it consists of: >>> thread = dt.expand_threads('d1') >>> for rid, reading in thread: ... print(rid, str(reading.normalize())) s0-r0 exists z1.(boxer(z1) & walk(z1)) s1-r0 all z1.(boxer(z1) -> exists z2.(girl(z2) & chase(z1,z2))) s2-r1 boxerdog(John) Suppose we have already defined a discourse, as follows: >>> dt = DiscourseTester(['A student dances', 'Every student is a person']) Now, when we add a new sentence, is it consistent with what we already have? 
The `` consistchk=True`` parameter of ``add_sentence()`` allows us to check: >>> dt.add_sentence('No person dances', consistchk=True) Inconsistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0']: s0-r0: exists z1.(student(z1) & dance(z1)) s1-r0: all z1.(student(z1) -> person(z1)) s2-r0: -exists z1.(person(z1) & dance(z1)) >>> dt.readings() s0 readings: s0-r0: exists z1.(student(z1) & dance(z1)) s1 readings: s1-r0: all z1.(student(z1) -> person(z1)) s2 readings: s2-r0: -exists z1.(person(z1) & dance(z1)) So let's retract the inconsistent sentence: >>> dt.retract_sentence('No person dances', verbose=True) # doctest: +NORMALIZE_WHITESPACE Current sentences are s0: A student dances s1: Every student is a person We can now verify that result is consistent. >>> dt.models() -------------------------------------------------------------------------------- Model for Discourse Thread d0 -------------------------------------------------------------------------------- % number = 1 % seconds = 0 % Interpretation of size 2 c1 = 0. dance(0). - dance(1). person(0). - person(1). student(0). - student(1). Consistent discourse: d0 ['s0-r0', 's1-r0']: s0-r0: exists z1.(student(z1) & dance(z1)) s1-r0: all z1.(student(z1) -> person(z1)) Checking Informativity ====================== Let's assume that we are still trying to extend the discourse *A student dances.* *Every student is a person.* We add a new sentence, but this time, we check whether it is informative with respect to what has gone before. >>> dt.add_sentence('A person dances', informchk=True) Sentence 'A person dances' under reading 'exists x.(person(x) & dance(x))': Not informative relative to thread 'd0' In fact, we are just checking whether the new sentence is entailed by the preceding discourse. >>> dt.models() -------------------------------------------------------------------------------- Model for Discourse Thread d0 -------------------------------------------------------------------------------- % number = 1 % seconds = 0 % Interpretation of size 2 c1 = 0. c2 = 0. dance(0). - dance(1). person(0). - person(1). student(0). - student(1). Consistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0']: s0-r0: exists z1.(student(z1) & dance(z1)) s1-r0: all z1.(student(z1) -> person(z1)) s2-r0: exists z1.(person(z1) & dance(z1)) Adding Background Knowledge =========================== Let's build a new discourse, and look at the readings of the component sentences: >>> dt = DiscourseTester(['Vincent is a boxer', 'Fido is a boxer', 'Vincent is married', 'Fido barks']) >>> dt.readings() s0 readings: s0-r0: boxer(Vincent) s0-r1: boxerdog(Vincent) s1 readings: s1-r0: boxer(Fido) s1-r1: boxerdog(Fido) s2 readings: s2-r0: married(Vincent) s3 readings: s3-r0: bark(Fido) This gives us a lot of threads: >>> dt.readings(threaded=True) # doctest: +NORMALIZE_WHITESPACE d0: ['s0-r0', 's1-r0', 's2-r0', 's3-r0'] d1: ['s0-r0', 's1-r1', 's2-r0', 's3-r0'] d2: ['s0-r1', 's1-r0', 's2-r0', 's3-r0'] d3: ['s0-r1', 's1-r1', 's2-r0', 's3-r0'] We can eliminate some of the readings, and hence some of the threads, by adding background information. 
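Roughly speaking, this elimination works by conjoining each thread's readings
with the background axioms and asking the Mace4 model builder for a model;
threads for which no model can be built are rejected as inconsistent. A minimal
sketch of that check is given below (it assumes Mace4 is installed, and
``consistent`` is an illustrative helper, not part of the NLTK API)::

    from nltk.inference.mace import Mace

    def consistent(readings, background):
        # ``readings`` and ``background`` are lists of nltk.sem.logic expressions.
        # build_model() reports whether Mace4 finds a model for the assumptions,
        # i.e. whether the thread is consistent with the background knowledge.
        assumptions = list(readings) + list(background)
        return Mace().build_model(None, assumptions)

Returning to the example, we load the background axioms provided with the book
grammars:
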
>>> import nltk.data >>> bg = nltk.data.load('grammars/book_grammars/background.fol') >>> dt.add_background(bg) >>> dt.background() all x.(boxerdog(x) -> dog(x)) all x.(boxer(x) -> person(x)) all x.-(dog(x) & person(x)) all x.(married(x) <-> exists y.marry(x,y)) all x.(bark(x) -> dog(x)) all x y.(marry(x,y) -> (person(x) & person(y))) -(Vincent = Mia) -(Vincent = Fido) -(Mia = Fido) The background information allows us to reject three of the threads as inconsistent. To see what remains, use the ``filter=True`` parameter on ``readings()``. >>> dt.readings(filter=True) # doctest: +NORMALIZE_WHITESPACE d1: ['s0-r0', 's1-r1', 's2-r0', 's3-r0'] The ``models()`` method gives us more information about the surviving thread. >>> dt.models() -------------------------------------------------------------------------------- Model for Discourse Thread d0 -------------------------------------------------------------------------------- No model found! -------------------------------------------------------------------------------- Model for Discourse Thread d1 -------------------------------------------------------------------------------- % number = 1 % seconds = 0 % Interpretation of size 3 Fido = 0. Mia = 1. Vincent = 2. f1(0) = 0. f1(1) = 0. f1(2) = 2. bark(0). - bark(1). - bark(2). - boxer(0). - boxer(1). boxer(2). boxerdog(0). - boxerdog(1). - boxerdog(2). dog(0). - dog(1). - dog(2). - married(0). - married(1). married(2). - person(0). - person(1). person(2). - marry(0,0). - marry(0,1). - marry(0,2). - marry(1,0). - marry(1,1). - marry(1,2). - marry(2,0). - marry(2,1). marry(2,2). -------------------------------------------------------------------------------- Model for Discourse Thread d2 -------------------------------------------------------------------------------- No model found! -------------------------------------------------------------------------------- Model for Discourse Thread d3 -------------------------------------------------------------------------------- No model found! Inconsistent discourse: d0 ['s0-r0', 's1-r0', 's2-r0', 's3-r0']: s0-r0: boxer(Vincent) s1-r0: boxer(Fido) s2-r0: married(Vincent) s3-r0: bark(Fido) Consistent discourse: d1 ['s0-r0', 's1-r1', 's2-r0', 's3-r0']: s0-r0: boxer(Vincent) s1-r1: boxerdog(Fido) s2-r0: married(Vincent) s3-r0: bark(Fido) Inconsistent discourse: d2 ['s0-r1', 's1-r0', 's2-r0', 's3-r0']: s0-r1: boxerdog(Vincent) s1-r0: boxer(Fido) s2-r0: married(Vincent) s3-r0: bark(Fido) Inconsistent discourse: d3 ['s0-r1', 's1-r1', 's2-r0', 's3-r0']: s0-r1: boxerdog(Vincent) s1-r1: boxerdog(Fido) s2-r0: married(Vincent) s3-r0: bark(Fido) .. This will not be visible in the html output: create a tempdir to play in. >>> import tempfile, os >>> tempdir = tempfile.mkdtemp() >>> old_dir = os.path.abspath('.') >>> os.chdir(tempdir) In order to play around with your own version of background knowledge, you might want to start off with a local copy of ``background.fol``: >>> nltk.data.retrieve('grammars/book_grammars/background.fol') Retrieving 'nltk:grammars/book_grammars/background.fol', saving to 'background.fol' After you have modified the file, the ``load_fol()`` function will parse the strings in the file into expressions of ``nltk.sem.logic``. >>> from nltk.inference.discourse import load_fol >>> mybg = load_fol(open('background.fol').read()) The result can be loaded as an argument of ``add_background()`` in the manner shown earlier. .. This will not be visible in the html output: clean up the tempdir. 
>>> os.chdir(old_dir) >>> for f in os.listdir(tempdir): ... os.remove(os.path.join(tempdir, f)) >>> os.rmdir(tempdir) >>> nltk.data.clear_cache() Regression Testing from book ============================ >>> logic._counter._value = 0 >>> from nltk.tag import RegexpTagger >>> tagger = RegexpTagger( ... [('^(chases|runs)$', 'VB'), ... ('^(a)$', 'ex_quant'), ... ('^(every)$', 'univ_quant'), ... ('^(dog|boy)$', 'NN'), ... ('^(He)$', 'PRP') ... ]) >>> rc = DrtGlueReadingCommand(depparser=MaltParser(tagger=tagger)) >>> dt = DiscourseTester(map(str.split, ['Every dog chases a boy', 'He runs']), rc) >>> dt.readings() s0 readings: s0-r0: ([z2],[boy(z2), (([z5],[dog(z5)]) -> ([],[chases(z5,z2)]))]) s0-r1: ([],[(([z1],[dog(z1)]) -> ([z2],[boy(z2), chases(z1,z2)]))]) s1 readings: s1-r0: ([z1],[PRO(z1), runs(z1)]) >>> dt.readings(show_thread_readings=True) d0: ['s0-r0', 's1-r0'] : ([z1,z2],[boy(z1), (([z3],[dog(z3)]) -> ([],[chases(z3,z1)])), (z2 = z1), runs(z2)]) d1: ['s0-r1', 's1-r0'] : INVALID: AnaphoraResolutionException >>> dt.readings(filter=True, show_thread_readings=True) d0: ['s0-r0', 's1-r0'] : ([z1,z3],[boy(z1), (([z2],[dog(z2)]) -> ([],[chases(z2,z1)])), (z3 = z1), runs(z3)]) >>> logic._counter._value = 0 >>> from nltk.parse import FeatureEarleyChartParser >>> from nltk.sem.drt import DrtParser >>> grammar = nltk.data.load('grammars/book_grammars/drt.fcfg', logic_parser=DrtParser()) >>> parser = FeatureEarleyChartParser(grammar, trace=0) >>> trees = parser.parse('Angus owns a dog'.split()) >>> print(list(trees)[0].label()['SEM'].simplify().normalize()) ([z1,z2],[Angus(z1), dog(z2), own(z1,z2)]) nltk-3.1/nltk/test/discourse_fixt.py0000644000076500000240000000073512574600335017421 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import # FIXME: the entire discourse.doctest is skipped if Prover9/Mace4 is # not installed, but there are pure-python parts that don't need Prover9. def setup_module(module): from nose import SkipTest from nltk.inference.mace import Mace try: m = Mace() m._find_binary('mace4') except LookupError: raise SkipTest("Mace4/Prover9 is not available so discourse.doctest is skipped") nltk-3.1/nltk/test/doctest_nose_plugin.py0000644000076500000240000001357012574600335020437 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import print_function from nose.suite import ContextList import re import sys import os import codecs import doctest from nose.plugins.base import Plugin from nose.util import tolist, anyp from nose.plugins.doctests import Doctest, log, DocFileCase ALLOW_UNICODE = doctest.register_optionflag('ALLOW_UNICODE') class _UnicodeOutputChecker(doctest.OutputChecker): _literal_re = re.compile(r"(\W|^)[uU]([rR]?[\'\"])", re.UNICODE) def _remove_u_prefixes(self, txt): return re.sub(self._literal_re, r'\1\2', txt) def check_output(self, want, got, optionflags): res = doctest.OutputChecker.check_output(self, want, got, optionflags) if res: return True if not (optionflags & ALLOW_UNICODE): return False # ALLOW_UNICODE is active and want != got cleaned_want = self._remove_u_prefixes(want) cleaned_got = self._remove_u_prefixes(got) res = doctest.OutputChecker.check_output(self, cleaned_want, cleaned_got, optionflags) return res _checker = _UnicodeOutputChecker() class DoctestPluginHelper(object): """ This mixin adds print_function future import to all test cases. It also adds support for: '#doctest +ALLOW_UNICODE' option that makes DocTestCase think u'foo' == 'foo'. 
'#doctest doctestencoding=utf-8' option that changes the encoding of doctest files """ OPTION_BY_NAME = ('doctestencoding',) def loadTestsFromFileUnicode(self, filename): if self.extension and anyp(filename.endswith, self.extension): name = os.path.basename(filename) dh = codecs.open(filename, 'r', self.options.get('doctestencoding')) try: doc = dh.read() finally: dh.close() fixture_context = None globs = {'__file__': filename} if self.fixtures: base, ext = os.path.splitext(name) dirname = os.path.dirname(filename) sys.path.append(dirname) fixt_mod = base + self.fixtures try: fixture_context = __import__( fixt_mod, globals(), locals(), ["nop"]) except ImportError as e: log.debug( "Could not import %s: %s (%s)", fixt_mod, e, sys.path) log.debug("Fixture module %s resolved to %s", fixt_mod, fixture_context) if hasattr(fixture_context, 'globs'): globs = fixture_context.globs(globs) parser = doctest.DocTestParser() test = parser.get_doctest( doc, globs=globs, name=name, filename=filename, lineno=0) if test.examples: case = DocFileCase( test, optionflags=self.optionflags, setUp=getattr(fixture_context, 'setup_test', None), tearDown=getattr(fixture_context, 'teardown_test', None), result_var=self.doctest_result_var) if fixture_context: yield ContextList((case,), context=fixture_context) else: yield case else: yield False # no tests to load def loadTestsFromFile(self, filename): cases = self.loadTestsFromFileUnicode(filename) for case in cases: if isinstance(case, ContextList): yield ContextList([self._patchTestCase(c) for c in case], case.context) else: yield self._patchTestCase(case) def loadTestsFromModule(self, module): """Load doctests from the module. """ for suite in super(DoctestPluginHelper, self).loadTestsFromModule(module): cases = [self._patchTestCase(case) for case in suite._get_tests()] yield self.suiteClass(cases, context=module, can_split=False) def _patchTestCase(self, case): if case: case._dt_test.globs['print_function'] = print_function case._dt_checker = _checker return case def configure(self, options, config): # it is overriden in order to fix doctest options discovery Plugin.configure(self, options, config) self.doctest_result_var = options.doctest_result_var self.doctest_tests = options.doctest_tests self.extension = tolist(options.doctestExtension) self.fixtures = options.doctestFixtures self.finder = doctest.DocTestFinder() #super(DoctestPluginHelper, self).configure(options, config) self.optionflags = 0 self.options = {} if options.doctestOptions: stroptions = ",".join(options.doctestOptions).split(',') for stroption in stroptions: try: if stroption.startswith('+'): self.optionflags |= doctest.OPTIONFLAGS_BY_NAME[stroption[1:]] continue elif stroption.startswith('-'): self.optionflags &= ~doctest.OPTIONFLAGS_BY_NAME[stroption[1:]] continue try: key,value=stroption.split('=') except ValueError: pass else: if not key in self.OPTION_BY_NAME: raise ValueError() self.options[key]=value continue except (AttributeError, ValueError, KeyError): raise ValueError("Unknown doctest option {}".format(stroption)) else: raise ValueError("Doctest option is not a flag or a key/value pair: {} ".format(stroption)) class DoctestFix(DoctestPluginHelper, Doctest): pass nltk-3.1/nltk/test/drt.doctest0000644000076500000240000004615312607224144016176 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. 
For license information, see LICENSE.TXT ================================ Discourse Representation Theory ================================ >>> from nltk.sem import logic >>> from nltk.inference import TableauProver Overview ======== A DRS can be created with the ``DRS()`` constructor. This takes two arguments: a list of discourse referents and list of conditions. . >>> from nltk.sem.drt import * >>> dexpr = DrtExpression.fromstring >>> man_x = dexpr('man(x)') >>> walk_x = dexpr('walk(x)') >>> x = dexpr('x') >>> print(DRS([x], [man_x, walk_x])) ([x],[man(x), walk(x)]) The ``parse()`` method can also be applied directly to DRS expressions, which allows them to be specified more easily. >>> drs1 = dexpr('([x],[man(x),walk(x)])') >>> print(drs1) ([x],[man(x), walk(x)]) DRSs can be *merged* using the ``+`` operator. >>> drs2 = dexpr('([y],[woman(y),stop(y)])') >>> drs3 = drs1 + drs2 >>> print(drs3) (([x],[man(x), walk(x)]) + ([y],[woman(y), stop(y)])) >>> print(drs3.simplify()) ([x,y],[man(x), walk(x), woman(y), stop(y)]) We can embed DRSs as components of an ``implies`` condition. >>> s = '([], [(%s -> %s)])' % (drs1, drs2) >>> print(dexpr(s)) ([],[(([x],[man(x), walk(x)]) -> ([y],[woman(y), stop(y)]))]) The ``fol()`` method converts DRSs into FOL formulae. >>> print(dexpr(r'([x],[man(x), walks(x)])').fol()) exists x.(man(x) & walks(x)) >>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])').fol()) all x.(man(x) -> walks(x)) In order to visualize a DRS, the ``pretty_format()`` method can be used. >>> print(drs3.pretty_format()) _________ __________ | x | | y | (|---------| + |----------|) | man(x) | | woman(y) | | walk(x) | | stop(y) | |_________| |__________| Parse to semantics ------------------ .. >>> logic._counter._value = 0 DRSs can be used for building compositional semantics in a feature based grammar. To specify that we want to use DRSs, the appropriate logic parser needs be passed as a parameter to ``load_earley()`` >>> from nltk.parse import load_parser >>> from nltk.sem.drt import DrtParser >>> parser = load_parser('grammars/book_grammars/drt.fcfg', trace=0, logic_parser=DrtParser()) >>> for tree in parser.parse('a dog barks'.split()): ... print(tree.label()['SEM'].simplify()) ... ([x],[dog(x), bark(x)]) Alternatively, a ``FeatStructReader`` can be passed with the ``logic_parser`` set on it >>> from nltk.featstruct import FeatStructReader >>> from nltk.grammar import FeatStructNonterminal >>> parser = load_parser('grammars/book_grammars/drt.fcfg', trace=0, fstruct_reader=FeatStructReader(fdict_class=FeatStructNonterminal, logic_parser=DrtParser())) >>> for tree in parser.parse('every girl chases a dog'.split()): ... print(tree.label()['SEM'].simplify().normalize()) ... 
([],[(([z1],[girl(z1)]) -> ([z2],[dog(z2), chase(z1,z2)]))]) Unit Tests ========== Parser ------ >>> print(dexpr(r'([x,y],[sees(x,y)])')) ([x,y],[sees(x,y)]) >>> print(dexpr(r'([x],[man(x), walks(x)])')) ([x],[man(x), walks(x)]) >>> print(dexpr(r'\x.([],[man(x), walks(x)])')) \x.([],[man(x), walks(x)]) >>> print(dexpr(r'\x.\y.([],[sees(x,y)])')) \x y.([],[sees(x,y)]) >>> print(dexpr(r'([x,y],[(x = y)])')) ([x,y],[(x = y)]) >>> print(dexpr(r'([x,y],[(x != y)])')) ([x,y],[-(x = y)]) >>> print(dexpr(r'\x.([],[walks(x)])(john)')) (\x.([],[walks(x)]))(john) >>> print(dexpr(r'\R.\x.([],[big(x,R)])(\y.([],[mouse(y)]))')) (\R x.([],[big(x,R)]))(\y.([],[mouse(y)])) >>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))')) (([x],[walks(x)]) + ([y],[runs(y)])) >>> print(dexpr(r'(([x,y],[walks(x), jumps(y)]) + (([z],[twos(z)]) + ([w],[runs(w)])))')) (([x,y],[walks(x), jumps(y)]) + ([z],[twos(z)]) + ([w],[runs(w)])) >>> print(dexpr(r'((([],[walks(x)]) + ([],[twos(x)])) + ([],[runs(x)]))')) (([],[walks(x)]) + ([],[twos(x)]) + ([],[runs(x)])) >>> print(dexpr(r'((([],[walks(x)]) + ([],[runs(x)])) + (([],[threes(x)]) + ([],[fours(x)])))')) (([],[walks(x)]) + ([],[runs(x)]) + ([],[threes(x)]) + ([],[fours(x)])) >>> print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))')) (([],[walks(x)]) -> ([],[runs(x)])) >>> print(dexpr(r'([x],[PRO(x), sees(John,x)])')) ([x],[PRO(x), sees(John,x)]) >>> print(dexpr(r'([x],[man(x), -([],[walks(x)])])')) ([x],[man(x), -([],[walks(x)])]) >>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])')) ([],[(([x],[man(x)]) -> ([],[walks(x)]))]) >>> print(dexpr(r'DRS([x],[walk(x)])')) ([x],[walk(x)]) >>> print(dexpr(r'DRS([x][walk(x)])')) ([x],[walk(x)]) >>> print(dexpr(r'([x][walk(x)])')) ([x],[walk(x)]) ``simplify()`` -------------- >>> print(dexpr(r'\x.([],[man(x), walks(x)])(john)').simplify()) ([],[man(john), walks(john)]) >>> print(dexpr(r'\x.\y.([z],[dog(z),sees(x,y)])(john)(mary)').simplify()) ([z],[dog(z), sees(john,mary)]) >>> print(dexpr(r'\R x.([],[big(x,R)])(\y.([],[mouse(y)]))').simplify()) \x.([],[big(x,\y.([],[mouse(y)]))]) >>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))').simplify()) ([x,y],[walks(x), runs(y)]) >>> print(dexpr(r'(([x,y],[walks(x), jumps(y)]) + (([z],[twos(z)]) + ([w],[runs(w)])))').simplify()) ([w,x,y,z],[walks(x), jumps(y), twos(z), runs(w)]) >>> print(dexpr(r'((([],[walks(x)]) + ([],[runs(x)]) + ([],[threes(x)]) + ([],[fours(x)])))').simplify()) ([],[walks(x), runs(x), threes(x), fours(x)]) >>> dexpr(r'([x],[man(x)])+([x],[walks(x)])').simplify() == \ ... dexpr(r'([x,z1],[man(x), walks(z1)])') True >>> dexpr(r'([y],[boy(y), (([x],[dog(x)]) -> ([],[chase(x,y)]))])+([x],[run(x)])').simplify() == \ ... dexpr(r'([y,z1],[boy(y), (([x],[dog(x)]) -> ([],[chase(x,y)])), run(z1)])') True >>> dexpr(r'\Q.(([x],[john(x),walks(x)]) + Q)(([x],[PRO(x),leaves(x)]))').simplify() == \ ... 
dexpr(r'([x,z1],[john(x), walks(x), PRO(z1), leaves(z1)])') True >>> logic._counter._value = 0 >>> print(dexpr('([],[(([x],[dog(x)]) -> ([e,y],[boy(y), chase(e), subj(e,x), obj(e,y)]))])+([e,x],[PRO(x), run(e), subj(e,x)])').simplify().normalize().normalize()) ([e02,z5],[(([z3],[dog(z3)]) -> ([e01,z4],[boy(z4), chase(e01), subj(e01,z3), obj(e01,z4)])), PRO(z5), run(e02), subj(e02,z5)]) ``fol()`` ----------- >>> print(dexpr(r'([x,y],[sees(x,y)])').fol()) exists x y.sees(x,y) >>> print(dexpr(r'([x],[man(x), walks(x)])').fol()) exists x.(man(x) & walks(x)) >>> print(dexpr(r'\x.([],[man(x), walks(x)])').fol()) \x.(man(x) & walks(x)) >>> print(dexpr(r'\x y.([],[sees(x,y)])').fol()) \x y.sees(x,y) >>> print(dexpr(r'\x.([],[walks(x)])(john)').fol()) \x.walks(x)(john) >>> print(dexpr(r'\R x.([],[big(x,R)])(\y.([],[mouse(y)]))').fol()) (\R x.big(x,R))(\y.mouse(y)) >>> print(dexpr(r'(([x],[walks(x)]) + ([y],[runs(y)]))').fol()) (exists x.walks(x) & exists y.runs(y)) >>> print(dexpr(r'(([],[walks(x)]) -> ([],[runs(x)]))').fol()) (walks(x) -> runs(x)) >>> print(dexpr(r'([x],[PRO(x), sees(John,x)])').fol()) exists x.(PRO(x) & sees(John,x)) >>> print(dexpr(r'([x],[man(x), -([],[walks(x)])])').fol()) exists x.(man(x) & -walks(x)) >>> print(dexpr(r'([],[(([x],[man(x)]) -> ([],[walks(x)]))])').fol()) all x.(man(x) -> walks(x)) >>> print(dexpr(r'([x],[man(x) | walks(x)])').fol()) exists x.(man(x) | walks(x)) >>> print(dexpr(r'P(x) + ([x],[walks(x)])').fol()) (P(x) & exists x.walks(x)) ``resolve_anaphora()`` ---------------------- >>> from nltk.sem.drt import AnaphoraResolutionException >>> print(resolve_anaphora(dexpr(r'([x,y,z],[dog(x), cat(y), walks(z), PRO(z)])'))) ([x,y,z],[dog(x), cat(y), walks(z), (z = [x,y])]) >>> print(resolve_anaphora(dexpr(r'([],[(([x],[dog(x)]) -> ([y],[walks(y), PRO(y)]))])'))) ([],[(([x],[dog(x)]) -> ([y],[walks(y), (y = x)]))]) >>> print(resolve_anaphora(dexpr(r'(([x,y],[]) + ([],[PRO(x)]))')).simplify()) ([x,y],[(x = y)]) >>> try: print(resolve_anaphora(dexpr(r'([x],[walks(x), PRO(x)])'))) ... except AnaphoraResolutionException as e: print(e) Variable 'x' does not resolve to anything. >>> print(resolve_anaphora(dexpr('([e01,z6,z7],[boy(z6), PRO(z7), run(e01), subj(e01,z7)])'))) ([e01,z6,z7],[boy(z6), (z7 = z6), run(e01), subj(e01,z7)]) ``equiv()``: ---------------- >>> a = dexpr(r'([x],[man(x), walks(x)])') >>> b = dexpr(r'([x],[walks(x), man(x)])') >>> print(a.equiv(b, TableauProver())) True ``replace()``: -------------- >>> a = dexpr(r'a') >>> w = dexpr(r'w') >>> x = dexpr(r'x') >>> y = dexpr(r'y') >>> z = dexpr(r'z') replace bound ------------- >>> print(dexpr(r'([x],[give(x,y,z)])').replace(x.variable, a, False)) ([x],[give(x,y,z)]) >>> print(dexpr(r'([x],[give(x,y,z)])').replace(x.variable, a, True)) ([a],[give(a,y,z)]) replace unbound --------------- >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, a, False)) ([x],[give(x,a,z)]) >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, a, True)) ([x],[give(x,a,z)]) replace unbound with bound -------------------------- >>> dexpr(r'([x],[give(x,y,z)])').replace(y.variable, x, False) == \ ... dexpr('([z1],[give(z1,x,z)])') True >>> dexpr(r'([x],[give(x,y,z)])').replace(y.variable, x, True) == \ ... 
dexpr('([z1],[give(z1,x,z)])') True replace unbound with unbound ---------------------------- >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, z, False)) ([x],[give(x,z,z)]) >>> print(dexpr(r'([x],[give(x,y,z)])').replace(y.variable, z, True)) ([x],[give(x,z,z)]) replace unbound --------------- >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, False)) (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, True)) (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) replace bound ------------- >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(x.variable, a, False)) (([x],[P(x,y,z)]) + ([y],[Q(x,y,z)])) >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(x.variable, a, True)) (([a],[P(a,y,z)]) + ([y],[Q(a,y,z)])) replace unbound with unbound ---------------------------- >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, False)) (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) >>> print(dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,z)])').replace(z.variable, a, True)) (([x],[P(x,y,a)]) + ([y],[Q(x,y,a)])) replace unbound with bound on same side --------------------------------------- >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(z.variable, x, False) == \ ... dexpr(r'(([z1],[P(z1,y,x)]) + ([y],[Q(z1,y,w)]))') True >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(z.variable, x, True) == \ ... dexpr(r'(([z1],[P(z1,y,x)]) + ([y],[Q(z1,y,w)]))') True replace unbound with bound on other side ---------------------------------------- >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(w.variable, x, False) == \ ... dexpr(r'(([z1],[P(z1,y,z)]) + ([y],[Q(z1,y,x)]))') True >>> dexpr(r'([x],[P(x,y,z)])+([y],[Q(x,y,w)])').replace(w.variable, x, True) == \ ... dexpr(r'(([z1],[P(z1,y,z)]) + ([y],[Q(z1,y,x)]))') True replace unbound with double bound --------------------------------- >>> dexpr(r'([x],[P(x,y,z)])+([x],[Q(x,y,w)])').replace(z.variable, x, False) == \ ... dexpr(r'(([z1],[P(z1,y,x)]) + ([z1],[Q(z1,y,w)]))') True >>> dexpr(r'([x],[P(x,y,z)])+([x],[Q(x,y,w)])').replace(z.variable, x, True) == \ ... 
dexpr(r'(([z1],[P(z1,y,x)]) + ([z1],[Q(z1,y,w)]))') True regression tests ---------------- >>> d = dexpr('([x],[A(c), ([y],[B(x,y,z,a)])->([z],[C(x,y,z,a)])])') >>> print(d) ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) >>> print(d.pretty_format()) ____________________________________ | x | |------------------------------------| | A(c) | | ____________ ____________ | | | y | | z | | | (|------------| -> |------------|) | | | B(x,y,z,a) | | C(x,y,z,a) | | | |____________| |____________| | |____________________________________| >>> print(str(d)) ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) >>> print(d.fol()) exists x.(A(c) & all y.(B(x,y,z,a) -> exists z.C(x,y,z,a))) >>> print(d.replace(Variable('a'), DrtVariableExpression(Variable('r')))) ([x],[A(c), (([y],[B(x,y,z,r)]) -> ([z],[C(x,y,z,r)]))]) >>> print(d.replace(Variable('x'), DrtVariableExpression(Variable('r')))) ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) >>> print(d.replace(Variable('y'), DrtVariableExpression(Variable('r')))) ([x],[A(c), (([y],[B(x,y,z,a)]) -> ([z],[C(x,y,z,a)]))]) >>> print(d.replace(Variable('z'), DrtVariableExpression(Variable('r')))) ([x],[A(c), (([y],[B(x,y,r,a)]) -> ([z],[C(x,y,z,a)]))]) >>> print(d.replace(Variable('x'), DrtVariableExpression(Variable('r')), True)) ([r],[A(c), (([y],[B(r,y,z,a)]) -> ([z],[C(r,y,z,a)]))]) >>> print(d.replace(Variable('y'), DrtVariableExpression(Variable('r')), True)) ([x],[A(c), (([r],[B(x,r,z,a)]) -> ([z],[C(x,r,z,a)]))]) >>> print(d.replace(Variable('z'), DrtVariableExpression(Variable('r')), True)) ([x],[A(c), (([y],[B(x,y,r,a)]) -> ([r],[C(x,y,r,a)]))]) >>> print(d == dexpr('([l],[A(c), ([m],[B(l,m,z,a)])->([n],[C(l,m,n,a)])])')) True >>> d = dexpr('([],[([x,y],[B(x,y,h), ([a,b],[dee(x,a,g)])])->([z,w],[cee(x,y,f), ([c,d],[E(x,c,d,e)])])])') >>> sorted(d.free()) [Variable('B'), Variable('E'), Variable('e'), Variable('f'), Variable('g'), Variable('h')] >>> sorted(d.variables()) [Variable('B'), Variable('E'), Variable('e'), Variable('f'), Variable('g'), Variable('h')] >>> sorted(d.get_refs(True)) [Variable('a'), Variable('b'), Variable('c'), Variable('d'), Variable('w'), Variable('x'), Variable('y'), Variable('z')] >>> sorted(d.conds[0].get_refs(False)) [Variable('x'), Variable('y')] >>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)])->([],[C(x,y)]), ([x,y],[D(x,y)])->([],[E(x,y)]), ([],[F(x,y)])->([x,y],[G(x,y)])])').eliminate_equality()) ([x],[A(x,x), (([],[B(x,x)]) -> ([],[C(x,x)])), (([x,y],[D(x,y)]) -> ([],[E(x,y)])), (([],[F(x,x)]) -> ([x,y],[G(x,y)]))]) >>> print(dexpr('([x,y],[A(x,y), (x=y)]) -> ([],[B(x,y)])').eliminate_equality()) (([x],[A(x,x)]) -> ([],[B(x,x)])) >>> print(dexpr('([x,y],[A(x,y)]) -> ([],[B(x,y), (x=y)])').eliminate_equality()) (([x,y],[A(x,y)]) -> ([],[B(x,x)])) >>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)])])').eliminate_equality()) ([x],[A(x,x), ([],[B(x,x)])]) >>> print(dexpr('([x,y],[A(x,y), ([],[B(x,y), (x=y)])])').eliminate_equality()) ([x,y],[A(x,y), ([],[B(x,x)])]) >>> print(dexpr('([z8 z9 z10],[A(z8), z8=z10, z9=z10, B(z9), C(z10), D(z10)])').eliminate_equality()) ([z9],[A(z9), B(z9), C(z9), D(z9)]) >>> print(dexpr('([x,y],[A(x,y), (x=y), ([],[B(x,y)]), ([x,y],[C(x,y)])])').eliminate_equality()) ([x],[A(x,x), ([],[B(x,x)]), ([x,y],[C(x,y)])]) >>> print(dexpr('([x,y],[A(x,y)]) + ([],[B(x,y), (x=y)]) + ([],[C(x,y)])').eliminate_equality()) ([x],[A(x,x), B(x,x), C(x,x)]) >>> print(dexpr('([x,y],[B(x,y)])+([x,y],[C(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x')))) 
(([x,y],[B(x,y)]) + ([x,y],[C(x,y)])) >>> print(dexpr('(([x,y],[B(x,y)])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x')))) (([x,y],[B(x,y)]) + ([],[C(x,y)]) + ([],[D(x,y)])) >>> print(dexpr('(([],[B(x,y)])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x')))) (([],[B(x,x)]) + ([],[C(x,x)]) + ([],[D(x,x)])) >>> print(dexpr('(([],[B(x,y), ([x,y],[A(x,y)])])+([],[C(x,y)]))+([],[D(x,y)])').replace(Variable('y'), DrtVariableExpression(Variable('x'))).normalize()) (([],[B(z3,z1), ([z2,z3],[A(z3,z2)])]) + ([],[C(z3,z1)]) + ([],[D(z3,z1)])) Parse errors ============ >>> def parse_error(drtstring): ... try: dexpr(drtstring) ... except logic.LogicalExpressionException as e: print(e) >>> parse_error(r'') End of input found. Expression expected. ^ >>> parse_error(r'(') End of input found. Expression expected. ( ^ >>> parse_error(r'()') Unexpected token: ')'. Expression expected. () ^ >>> parse_error(r'([') End of input found. Expected token ']'. ([ ^ >>> parse_error(r'([,') ',' is an illegal variable name. Constants may not be quantified. ([, ^ >>> parse_error(r'([x,') End of input found. Variable expected. ([x, ^ >>> parse_error(r'([]') End of input found. Expected token '['. ([] ^ >>> parse_error(r'([][') End of input found. Expected token ']'. ([][ ^ >>> parse_error(r'([][,') Unexpected token: ','. Expression expected. ([][, ^ >>> parse_error(r'([][]') End of input found. Expected token ')'. ([][] ^ >>> parse_error(r'([x][man(x)]) |') End of input found. Expression expected. ([x][man(x)]) | ^ Pretty Printing =============== >>> dexpr(r"([],[])").pretty_print() __ | | |--| |__| >>> dexpr(r"([],[([x],[big(x), dog(x)]) -> ([],[bark(x)]) -([x],[walk(x)])])").pretty_print() _____________________________ | | |-----------------------------| | ________ _________ | | | x | | | | | (|--------| -> |---------|) | | | big(x) | | bark(x) | | | | dog(x) | |_________| | | |________| | | _________ | | | x | | | __ |---------| | | | | walk(x) | | | |_________| | |_____________________________| >>> dexpr(r"([x,y],[x=y]) + ([z],[dog(z), walk(z)])").pretty_print() _________ _________ | x y | | z | (|---------| + |---------|) | (x = y) | | dog(z) | |_________| | walk(z) | |_________| >>> dexpr(r"([],[([x],[]) | ([y],[]) | ([z],[dog(z), walk(z)])])").pretty_print() _______________________________ | | |-------------------------------| | ___ ___ _________ | | | x | | y | | z | | | (|---| | |---| | |---------|) | | |___| |___| | dog(z) | | | | walk(z) | | | |_________| | |_______________________________| >>> dexpr(r"\P.\Q.(([x],[]) + P(x) + Q(x))(\x.([],[dog(x)]))").pretty_print() ___ ________ \ | x | \ | | /\ P Q.(|---| + P(x) + Q(x))( /\ x.|--------|) |___| | dog(x) | |________| nltk-3.1/nltk/test/featgram.doctest0000644000076500000240000006734712607224144017203 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ========================= Feature Grammar Parsing ========================= .. include:: ../../../nltk_book/definitions.rst Grammars can be parsed from strings. >>> from __future__ import print_function >>> import nltk >>> from nltk import grammar, parse >>> g = """ ... % start DP ... DP[AGR=?a] -> D[AGR=?a] N[AGR=?a] ... D[AGR=[NUM='sg', PERS=3]] -> 'this' | 'that' ... D[AGR=[NUM='pl', PERS=3]] -> 'these' | 'those' ... D[AGR=[NUM='pl', PERS=1]] -> 'we' ... D[AGR=[PERS=2]] -> 'you' ... N[AGR=[NUM='sg', GND='m']] -> 'boy' ... N[AGR=[NUM='pl', GND='m']] -> 'boys' ... 
N[AGR=[NUM='sg', GND='f']] -> 'girl' ... N[AGR=[NUM='pl', GND='f']] -> 'girls' ... N[AGR=[NUM='sg']] -> 'student' ... N[AGR=[NUM='pl']] -> 'students' ... """ >>> grammar = grammar.FeatureGrammar.fromstring(g) >>> tokens = 'these girls'.split() >>> parser = parse.FeatureEarleyChartParser(grammar) >>> trees = parser.parse(tokens) >>> for tree in trees: print(tree) (DP[AGR=[GND='f', NUM='pl', PERS=3]] (D[AGR=[NUM='pl', PERS=3]] these) (N[AGR=[GND='f', NUM='pl']] girls)) In general, when we are trying to develop even a very small grammar, it is convenient to put the rules in a file where they can be edited, tested and revised. Let's assume that we have saved feat0cfg_ as a file named ``'feat0.fcfg'`` and placed it in the NLTK ``data`` directory. We can inspect it as follows: .. _feat0cfg: http://nltk.svn.sourceforge.net/svnroot/nltk/trunk/nltk/data/grammars/feat0.fcfg >>> nltk.data.show_cfg('grammars/book_grammars/feat0.fcfg') % start S # ################### # Grammar Productions # ################### # S expansion productions S -> NP[NUM=?n] VP[NUM=?n] # NP expansion productions NP[NUM=?n] -> N[NUM=?n] NP[NUM=?n] -> PropN[NUM=?n] NP[NUM=?n] -> Det[NUM=?n] N[NUM=?n] NP[NUM=pl] -> N[NUM=pl] # VP expansion productions VP[TENSE=?t, NUM=?n] -> IV[TENSE=?t, NUM=?n] VP[TENSE=?t, NUM=?n] -> TV[TENSE=?t, NUM=?n] NP # ################### # Lexical Productions # ################### Det[NUM=sg] -> 'this' | 'every' Det[NUM=pl] -> 'these' | 'all' Det -> 'the' | 'some' | 'several' PropN[NUM=sg]-> 'Kim' | 'Jody' N[NUM=sg] -> 'dog' | 'girl' | 'car' | 'child' N[NUM=pl] -> 'dogs' | 'girls' | 'cars' | 'children' IV[TENSE=pres, NUM=sg] -> 'disappears' | 'walks' TV[TENSE=pres, NUM=sg] -> 'sees' | 'likes' IV[TENSE=pres, NUM=pl] -> 'disappear' | 'walk' TV[TENSE=pres, NUM=pl] -> 'see' | 'like' IV[TENSE=past] -> 'disappeared' | 'walked' TV[TENSE=past] -> 'saw' | 'liked' Assuming we have saved feat0cfg_ as a file named ``'feat0.fcfg'``, the function ``parse.load_parser`` allows us to read the grammar into NLTK, ready for use in parsing. >>> cp = parse.load_parser('grammars/book_grammars/feat0.fcfg', trace=1) >>> sent = 'Kim likes children' >>> tokens = sent.split() >>> tokens ['Kim', 'likes', 'children'] >>> trees = cp.parse(tokens) |.Kim .like.chil.| |[----] . .| [0:1] 'Kim' |. [----] .| [1:2] 'likes' |. . [----]| [2:3] 'children' |[----] . .| [0:1] PropN[NUM='sg'] -> 'Kim' * |[----] . .| [0:1] NP[NUM='sg'] -> PropN[NUM='sg'] * |[----> . .| [0:1] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'sg'} |. [----] .| [1:2] TV[NUM='sg', TENSE='pres'] -> 'likes' * |. [----> .| [1:2] VP[NUM=?n, TENSE=?t] -> TV[NUM=?n, TENSE=?t] * NP[] {?n: 'sg', ?t: 'pres'} |. . [----]| [2:3] N[NUM='pl'] -> 'children' * |. . [----]| [2:3] NP[NUM='pl'] -> N[NUM='pl'] * |. . [---->| [2:3] S[] -> NP[NUM=?n] * VP[NUM=?n] {?n: 'pl'} |. [---------]| [1:3] VP[NUM='sg', TENSE='pres'] -> TV[NUM='sg', TENSE='pres'] NP[] * |[==============]| [0:3] S[] -> NP[NUM='sg'] VP[NUM='sg'] * >>> for tree in trees: print(tree) (S[] (NP[NUM='sg'] (PropN[NUM='sg'] Kim)) (VP[NUM='sg', TENSE='pres'] (TV[NUM='sg', TENSE='pres'] likes) (NP[NUM='pl'] (N[NUM='pl'] children)))) The parser works directly with the underspecified productions given by the grammar. That is, the Predictor rule does not attempt to compile out all admissible feature combinations before trying to expand the non-terminals on the left hand side of a production. 
However, when the Scanner matches an input word against a lexical production that has been predicted, the new edge will typically contain fully specified features; e.g., the edge [PropN[`num`:feat: = `sg`:fval:] |rarr| 'Kim', (0, 1)]. Recall from Chapter 8 that the Fundamental (or Completer) Rule in standard CFGs is used to combine an incomplete edge that's expecting a nonterminal *B* with a following, complete edge whose left hand side matches *B*. In our current setting, rather than checking for a complete match, we test whether the expected category *B* will `unify`:dt: with the left hand side *B'* of a following complete edge. We will explain in more detail in Section 9.2 how unification works; for the moment, it is enough to know that as a result of unification, any variable values of features in *B* will be instantiated by constant values in the corresponding feature structure in *B'*, and these instantiated values will be used in the new edge added by the Completer. This instantiation can be seen, for example, in the edge [NP [`num`:feat:\ =\ `sg`:fval:] |rarr| PropN[`num`:feat:\ =\ `sg`:fval:] |dot|, (0, 1)] in Example 9.2, where the feature `num`:feat: has been assigned the value `sg`:fval:. Feature structures in NLTK are ... Atomic feature values can be strings or integers. >>> fs1 = nltk.FeatStruct(TENSE='past', NUM='sg') >>> print(fs1) [ NUM = 'sg' ] [ TENSE = 'past' ] We can think of a feature structure as being like a Python dictionary, and access its values by indexing in the usual way. >>> fs1 = nltk.FeatStruct(PER=3, NUM='pl', GND='fem') >>> print(fs1['GND']) fem We can also define feature structures which have complex values, as discussed earlier. >>> fs2 = nltk.FeatStruct(POS='N', AGR=fs1) >>> print(fs2) [ [ GND = 'fem' ] ] [ AGR = [ NUM = 'pl' ] ] [ [ PER = 3 ] ] [ ] [ POS = 'N' ] >>> print(fs2['AGR']) [ GND = 'fem' ] [ NUM = 'pl' ] [ PER = 3 ] >>> print(fs2['AGR']['PER']) 3 Feature structures can also be constructed using the ``parse()`` method of the ``nltk.FeatStruct`` class. Note that in this case, atomic feature values do not need to be enclosed in quotes. >>> f1 = nltk.FeatStruct("[NUMBER = sg]") >>> f2 = nltk.FeatStruct("[PERSON = 3]") >>> print(nltk.unify(f1, f2)) [ NUMBER = 'sg' ] [ PERSON = 3 ] >>> f1 = nltk.FeatStruct("[A = [B = b, D = d]]") >>> f2 = nltk.FeatStruct("[A = [C = c, D = d]]") >>> print(nltk.unify(f1, f2)) [ [ B = 'b' ] ] [ A = [ C = 'c' ] ] [ [ D = 'd' ] ] Feature Structures as Graphs ---------------------------- Feature structures are not inherently tied to linguistic objects; they are general purpose structures for representing knowledge. For example, we could encode information about a person in a feature structure: >>> person01 = nltk.FeatStruct("[NAME=Lee, TELNO='01 27 86 42 96',AGE=33]") >>> print(person01) [ AGE = 33 ] [ NAME = 'Lee' ] [ TELNO = '01 27 86 42 96' ] There are a number of notations for representing reentrancy in matrix-style representations of feature structures. In NLTK, we adopt the following convention: the first occurrence of a shared feature structure is prefixed with an integer in parentheses, such as ``(1)``, and any subsequent reference to that structure uses the notation ``->(1)``, as shown below. >>> fs = nltk.FeatStruct("""[NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'], ... 
SPOUSE=[NAME=Kim, ADDRESS->(1)]]""") >>> print(fs) [ ADDRESS = (1) [ NUMBER = 74 ] ] [ [ STREET = 'rue Pascal' ] ] [ ] [ NAME = 'Lee' ] [ ] [ SPOUSE = [ ADDRESS -> (1) ] ] [ [ NAME = 'Kim' ] ] There can be any number of tags within a single feature structure. >>> fs3 = nltk.FeatStruct("[A=(1)[B=b], C=(2)[], D->(1), E->(2)]") >>> print(fs3) [ A = (1) [ B = 'b' ] ] [ ] [ C = (2) [] ] [ ] [ D -> (1) ] [ E -> (2) ] >>> fs1 = nltk.FeatStruct(NUMBER=74, STREET='rue Pascal') >>> fs2 = nltk.FeatStruct(CITY='Paris') >>> print(nltk.unify(fs1, fs2)) [ CITY = 'Paris' ] [ NUMBER = 74 ] [ STREET = 'rue Pascal' ] Unification is symmetric: >>> nltk.unify(fs1, fs2) == nltk.unify(fs2, fs1) True Unification is commutative: >>> fs3 = nltk.FeatStruct(TELNO='01 27 86 42 96') >>> nltk.unify(nltk.unify(fs1, fs2), fs3) == nltk.unify(fs1, nltk.unify(fs2, fs3)) True Unification between `FS`:math:\ :subscript:`0` and `FS`:math:\ :subscript:`1` will fail if the two feature structures share a path |pi|, but the value of |pi| in `FS`:math:\ :subscript:`0` is a distinct atom from the value of |pi| in `FS`:math:\ :subscript:`1`. In NLTK, this is implemented by setting the result of unification to be ``None``. >>> fs0 = nltk.FeatStruct(A='a') >>> fs1 = nltk.FeatStruct(A='b') >>> print(nltk.unify(fs0, fs1)) None Now, if we look at how unification interacts with structure-sharing, things become really interesting. >>> fs0 = nltk.FeatStruct("""[NAME=Lee, ... ADDRESS=[NUMBER=74, ... STREET='rue Pascal'], ... SPOUSE= [NAME=Kim, ... ADDRESS=[NUMBER=74, ... STREET='rue Pascal']]]""") >>> print(fs0) [ ADDRESS = [ NUMBER = 74 ] ] [ [ STREET = 'rue Pascal' ] ] [ ] [ NAME = 'Lee' ] [ ] [ [ ADDRESS = [ NUMBER = 74 ] ] ] [ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ] [ [ ] ] [ [ NAME = 'Kim' ] ] >>> fs1 = nltk.FeatStruct("[SPOUSE=[ADDRESS=[CITY=Paris]]]") >>> print(nltk.unify(fs0, fs1)) [ ADDRESS = [ NUMBER = 74 ] ] [ [ STREET = 'rue Pascal' ] ] [ ] [ NAME = 'Lee' ] [ ] [ [ [ CITY = 'Paris' ] ] ] [ [ ADDRESS = [ NUMBER = 74 ] ] ] [ SPOUSE = [ [ STREET = 'rue Pascal' ] ] ] [ [ ] ] [ [ NAME = 'Kim' ] ] >>> fs2 = nltk.FeatStruct("""[NAME=Lee, ADDRESS=(1)[NUMBER=74, STREET='rue Pascal'], ... SPOUSE=[NAME=Kim, ADDRESS->(1)]]""") >>> print(fs2) [ ADDRESS = (1) [ NUMBER = 74 ] ] [ [ STREET = 'rue Pascal' ] ] [ ] [ NAME = 'Lee' ] [ ] [ SPOUSE = [ ADDRESS -> (1) ] ] [ [ NAME = 'Kim' ] ] >>> print(nltk.unify(fs2, fs1)) [ [ CITY = 'Paris' ] ] [ ADDRESS = (1) [ NUMBER = 74 ] ] [ [ STREET = 'rue Pascal' ] ] [ ] [ NAME = 'Lee' ] [ ] [ SPOUSE = [ ADDRESS -> (1) ] ] [ [ NAME = 'Kim' ] ] >>> fs1 = nltk.FeatStruct("[ADDRESS1=[NUMBER=74, STREET='rue Pascal']]") >>> fs2 = nltk.FeatStruct("[ADDRESS1=?x, ADDRESS2=?x]") >>> print(fs2) [ ADDRESS1 = ?x ] [ ADDRESS2 = ?x ] >>> print(nltk.unify(fs1, fs2)) [ ADDRESS1 = (1) [ NUMBER = 74 ] ] [ [ STREET = 'rue Pascal' ] ] [ ] [ ADDRESS2 -> (1) ] >>> sent = 'who do you claim that you like' >>> tokens = sent.split() >>> cp = parse.load_parser('grammars/book_grammars/feat1.fcfg', trace=1) >>> trees = cp.parse(tokens) |.w.d.y.c.t.y.l.| |[-] . . . . . .| [0:1] 'who' |. [-] . . . . .| [1:2] 'do' |. . [-] . . . .| [2:3] 'you' |. . . [-] . . .| [3:4] 'claim' |. . . . [-] . .| [4:5] 'that' |. . . . . [-] .| [5:6] 'you' |. . . . . . [-]| [6:7] 'like' |# . . . . . . .| [0:0] NP[]/NP[] -> * |. # . . . . . .| [1:1] NP[]/NP[] -> * |. . # . . . . .| [2:2] NP[]/NP[] -> * |. . . # . . . .| [3:3] NP[]/NP[] -> * |. . . . # . . .| [4:4] NP[]/NP[] -> * |. . . . . # . .| [5:5] NP[]/NP[] -> * |. . . . . . # .| [6:6] NP[]/NP[] -> * |. . . . . 
. . #| [7:7] NP[]/NP[] -> * |[-] . . . . . .| [0:1] NP[+WH] -> 'who' * |[-> . . . . . .| [0:1] S[-INV] -> NP[] * VP[] {} |[-> . . . . . .| [0:1] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} |[-> . . . . . .| [0:1] S[-INV] -> NP[] * S[]/NP[] {} |. [-] . . . . .| [1:2] V[+AUX] -> 'do' * |. [-> . . . . .| [1:2] S[+INV] -> V[+AUX] * NP[] VP[] {} |. [-> . . . . .| [1:2] S[+INV]/?x[] -> V[+AUX] * NP[] VP[]/?x[] {} |. [-> . . . . .| [1:2] VP[] -> V[+AUX] * VP[] {} |. [-> . . . . .| [1:2] VP[]/?x[] -> V[+AUX] * VP[]/?x[] {} |. . [-] . . . .| [2:3] NP[-WH] -> 'you' * |. . [-> . . . .| [2:3] S[-INV] -> NP[] * VP[] {} |. . [-> . . . .| [2:3] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} |. . [-> . . . .| [2:3] S[-INV] -> NP[] * S[]/NP[] {} |. [---> . . . .| [1:3] S[+INV] -> V[+AUX] NP[] * VP[] {} |. [---> . . . .| [1:3] S[+INV]/?x[] -> V[+AUX] NP[] * VP[]/?x[] {} |. . . [-] . . .| [3:4] V[-AUX, SUBCAT='clause'] -> 'claim' * |. . . [-> . . .| [3:4] VP[] -> V[-AUX, SUBCAT='clause'] * SBar[] {} |. . . [-> . . .| [3:4] VP[]/?x[] -> V[-AUX, SUBCAT='clause'] * SBar[]/?x[] {} |. . . . [-] . .| [4:5] Comp[] -> 'that' * |. . . . [-> . .| [4:5] SBar[] -> Comp[] * S[-INV] {} |. . . . [-> . .| [4:5] SBar[]/?x[] -> Comp[] * S[-INV]/?x[] {} |. . . . . [-] .| [5:6] NP[-WH] -> 'you' * |. . . . . [-> .| [5:6] S[-INV] -> NP[] * VP[] {} |. . . . . [-> .| [5:6] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} |. . . . . [-> .| [5:6] S[-INV] -> NP[] * S[]/NP[] {} |. . . . . . [-]| [6:7] V[-AUX, SUBCAT='trans'] -> 'like' * |. . . . . . [->| [6:7] VP[] -> V[-AUX, SUBCAT='trans'] * NP[] {} |. . . . . . [->| [6:7] VP[]/?x[] -> V[-AUX, SUBCAT='trans'] * NP[]/?x[] {} |. . . . . . [-]| [6:7] VP[]/NP[] -> V[-AUX, SUBCAT='trans'] NP[]/NP[] * |. . . . . [---]| [5:7] S[-INV]/NP[] -> NP[] VP[]/NP[] * |. . . . [-----]| [4:7] SBar[]/NP[] -> Comp[] S[-INV]/NP[] * |. . . [-------]| [3:7] VP[]/NP[] -> V[-AUX, SUBCAT='clause'] SBar[]/NP[] * |. . [---------]| [2:7] S[-INV]/NP[] -> NP[] VP[]/NP[] * |. [-----------]| [1:7] S[+INV]/NP[] -> V[+AUX] NP[] VP[]/NP[] * |[=============]| [0:7] S[-INV] -> NP[] S[]/NP[] * >>> trees = list(trees) >>> for tree in trees: print(tree) (S[-INV] (NP[+WH] who) (S[+INV]/NP[] (V[+AUX] do) (NP[-WH] you) (VP[]/NP[] (V[-AUX, SUBCAT='clause'] claim) (SBar[]/NP[] (Comp[] that) (S[-INV]/NP[] (NP[-WH] you) (VP[]/NP[] (V[-AUX, SUBCAT='trans'] like) (NP[]/NP[] ))))))) A different parser should give the same parse trees, but perhaps in a different order: >>> cp2 = parse.load_parser('grammars/book_grammars/feat1.fcfg', trace=1, ... parser=parse.FeatureEarleyChartParser) >>> trees2 = cp2.parse(tokens) |.w.d.y.c.t.y.l.| |[-] . . . . . .| [0:1] 'who' |. [-] . . . . .| [1:2] 'do' |. . [-] . . . .| [2:3] 'you' |. . . [-] . . .| [3:4] 'claim' |. . . . [-] . .| [4:5] 'that' |. . . . . [-] .| [5:6] 'you' |. . . . . . [-]| [6:7] 'like' |> . . . . . . .| [0:0] S[-INV] -> * NP[] VP[] {} |> . . . . . . .| [0:0] S[-INV]/?x[] -> * NP[] VP[]/?x[] {} |> . . . . . . .| [0:0] S[-INV] -> * NP[] S[]/NP[] {} |> . . . . . . .| [0:0] S[-INV] -> * Adv[+NEG] S[+INV] {} |> . . . . . . .| [0:0] S[+INV] -> * V[+AUX] NP[] VP[] {} |> . . . . . . .| [0:0] S[+INV]/?x[] -> * V[+AUX] NP[] VP[]/?x[] {} |> . . . . . . .| [0:0] NP[+WH] -> * 'who' {} |[-] . . . . . .| [0:1] NP[+WH] -> 'who' * |[-> . . . . . .| [0:1] S[-INV] -> NP[] * VP[] {} |[-> . . . . . .| [0:1] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} |[-> . . . . . .| [0:1] S[-INV] -> NP[] * S[]/NP[] {} |. > . . . . . .| [1:1] S[-INV]/?x[] -> * NP[] VP[]/?x[] {} |. > . . . . . .| [1:1] S[+INV]/?x[] -> * V[+AUX] NP[] VP[]/?x[] {} |. > . . . 
. . .| [1:1] V[+AUX] -> * 'do' {} |. > . . . . . .| [1:1] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} |. > . . . . . .| [1:1] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} |. > . . . . . .| [1:1] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} |. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='intrans'] {} |. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='trans'] NP[] {} |. > . . . . . .| [1:1] VP[] -> * V[-AUX, SUBCAT='clause'] SBar[] {} |. > . . . . . .| [1:1] VP[] -> * V[+AUX] VP[] {} |. [-] . . . . .| [1:2] V[+AUX] -> 'do' * |. [-> . . . . .| [1:2] S[+INV]/?x[] -> V[+AUX] * NP[] VP[]/?x[] {} |. [-> . . . . .| [1:2] VP[]/?x[] -> V[+AUX] * VP[]/?x[] {} |. [-> . . . . .| [1:2] VP[] -> V[+AUX] * VP[] {} |. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='intrans'] {} |. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='trans'] NP[] {} |. . > . . . . .| [2:2] VP[] -> * V[-AUX, SUBCAT='clause'] SBar[] {} |. . > . . . . .| [2:2] VP[] -> * V[+AUX] VP[] {} |. . > . . . . .| [2:2] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} |. . > . . . . .| [2:2] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} |. . > . . . . .| [2:2] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} |. . > . . . . .| [2:2] NP[-WH] -> * 'you' {} |. . [-] . . . .| [2:3] NP[-WH] -> 'you' * |. [---> . . . .| [1:3] S[+INV]/?x[] -> V[+AUX] NP[] * VP[]/?x[] {} |. . . > . . . .| [3:3] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} |. . . > . . . .| [3:3] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} |. . . > . . . .| [3:3] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} |. . . > . . . .| [3:3] V[-AUX, SUBCAT='clause'] -> * 'claim' {} |. . . [-] . . .| [3:4] V[-AUX, SUBCAT='clause'] -> 'claim' * |. . . [-> . . .| [3:4] VP[]/?x[] -> V[-AUX, SUBCAT='clause'] * SBar[]/?x[] {} |. . . . > . . .| [4:4] SBar[]/?x[] -> * Comp[] S[-INV]/?x[] {} |. . . . > . . .| [4:4] Comp[] -> * 'that' {} |. . . . [-] . .| [4:5] Comp[] -> 'that' * |. . . . [-> . .| [4:5] SBar[]/?x[] -> Comp[] * S[-INV]/?x[] {} |. . . . . > . .| [5:5] S[-INV]/?x[] -> * NP[] VP[]/?x[] {} |. . . . . > . .| [5:5] NP[-WH] -> * 'you' {} |. . . . . [-] .| [5:6] NP[-WH] -> 'you' * |. . . . . [-> .| [5:6] S[-INV]/?x[] -> NP[] * VP[]/?x[] {} |. . . . . . > .| [6:6] VP[]/?x[] -> * V[-AUX, SUBCAT='trans'] NP[]/?x[] {} |. . . . . . > .| [6:6] VP[]/?x[] -> * V[-AUX, SUBCAT='clause'] SBar[]/?x[] {} |. . . . . . > .| [6:6] VP[]/?x[] -> * V[+AUX] VP[]/?x[] {} |. . . . . . > .| [6:6] V[-AUX, SUBCAT='trans'] -> * 'like' {} |. . . . . . [-]| [6:7] V[-AUX, SUBCAT='trans'] -> 'like' * |. . . . . . [->| [6:7] VP[]/?x[] -> V[-AUX, SUBCAT='trans'] * NP[]/?x[] {} |. . . . . . . #| [7:7] NP[]/NP[] -> * |. . . . . . [-]| [6:7] VP[]/NP[] -> V[-AUX, SUBCAT='trans'] NP[]/NP[] * |. . . . . [---]| [5:7] S[-INV]/NP[] -> NP[] VP[]/NP[] * |. . . . [-----]| [4:7] SBar[]/NP[] -> Comp[] S[-INV]/NP[] * |. . . [-------]| [3:7] VP[]/NP[] -> V[-AUX, SUBCAT='clause'] SBar[]/NP[] * |. 
[-----------]| [1:7] S[+INV]/NP[] -> V[+AUX] NP[] VP[]/NP[] * |[=============]| [0:7] S[-INV] -> NP[] S[]/NP[] * >>> sorted(trees) == sorted(trees2) True Let's load a German grammar: >>> cp = parse.load_parser('grammars/book_grammars/german.fcfg', trace=0) >>> sent = 'die Katze sieht den Hund' >>> tokens = sent.split() >>> trees = cp.parse(tokens) >>> for tree in trees: print(tree) (S[] (NP[AGR=[GND='fem', NUM='sg', PER=3], CASE='nom'] (Det[AGR=[GND='fem', NUM='sg', PER=3], CASE='nom'] die) (N[AGR=[GND='fem', NUM='sg', PER=3]] Katze)) (VP[AGR=[NUM='sg', PER=3]] (TV[AGR=[NUM='sg', PER=3], OBJCASE='acc'] sieht) (NP[AGR=[GND='masc', NUM='sg', PER=3], CASE='acc'] (Det[AGR=[GND='masc', NUM='sg', PER=3], CASE='acc'] den) (N[AGR=[GND='masc', NUM='sg', PER=3]] Hund)))) Grammar with Binding Operators ------------------------------ The `bindop.fcfg`_ grammar is a semantic grammar that uses lambda calculus. Each element has a core semantics, which is a single lambda calculus expression; and a set of binding operators, which bind variables. .. _bindop.fcfg: http://nltk.svn.sourceforge.net/svnroot/nltk/trunk/nltk/data/grammars/bindop.fcfg In order to make the binding operators work right, they need to instantiate their bound variable every time they are added to the chart. To do this, we use a special subclass of `Chart`, called `InstantiateVarsChart`. >>> from nltk.parse.featurechart import InstantiateVarsChart >>> cp = parse.load_parser('grammars/sample_grammars/bindop.fcfg', trace=1, ... chart_class=InstantiateVarsChart) >>> print(cp.grammar()) Grammar with 15 productions (start state = S[]) S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] VP[SEM=[BO=?b2, CORE=?vp]] VP[SEM=[BO={?b1+?b2}, CORE=]] -> TV[SEM=[BO=?b1, CORE=?v]] NP[SEM=[BO=?b2, CORE=?obj]] VP[SEM=?s] -> IV[SEM=?s] NP[SEM=[BO={?b1+?b2+{bo(?det(?n),@x)}}, CORE=<@x>]] -> Det[SEM=[BO=?b1, CORE=?det]] N[SEM=[BO=?b2, CORE=?n]] Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] -> 'a' N[SEM=[BO={/}, CORE=]] -> 'dog' N[SEM=[BO={/}, CORE=]] -> 'cat' N[SEM=[BO={/}, CORE=]] -> 'mouse' IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'barks' IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'eats' IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'walks' TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'feeds' TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'walks' NP[SEM=[BO={bo(\P.P(John),@x)}, CORE=<@x>]] -> 'john' NP[SEM=[BO={bo(\P.P(John),@x)}, CORE=<@x>]] -> 'alex' A simple intransitive sentence: >>> from nltk.sem import logic >>> logic._counter._value = 100 >>> trees = cp.parse('john barks'.split()) |. john.barks.| |[-----] .| [0:1] 'john' |. [-----]| [1:2] 'barks' |[-----] .| [0:1] NP[SEM=[BO={bo(\P.P(John),z101)}, CORE=]] -> 'john' * |[-----> .| [0:1] S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.P(John),z2)}, ?subj: } |. [-----]| [1:2] IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> 'barks' * |. [-----]| [1:2] VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] -> IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] * |[===========]| [0:2] S[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] -> NP[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] * >>> for tree in trees: print(tree) (S[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] (NP[SEM=[BO={bo(\P.P(John),z101)}, CORE=]] john) (VP[SEM=[BO={/}, CORE=<\x.bark(x)>]] (IV[SEM=[BO={/}, CORE=<\x.bark(x)>]] barks))) A transitive sentence: >>> trees = cp.parse('john feeds a dog'.split()) |.joh.fee. a .dog.| |[---] . . .| [0:1] 'john' |. [---] . .| [1:2] 'feeds' |. . [---] .| [2:3] 'a' |. . . 
[---]| [3:4] 'dog' |[---] . . .| [0:1] NP[SEM=[BO={bo(\P.P(John),z102)}, CORE=]] -> 'john' * |[---> . . .| [0:1] S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.P(John),z2)}, ?subj: } |. [---] . .| [1:2] TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] -> 'feeds' * |. [---> . .| [1:2] VP[SEM=[BO={?b1+?b2}, CORE=]] -> TV[SEM=[BO=?b1, CORE=?v]] * NP[SEM=[BO=?b2, CORE=?obj]] {?b1: {/}, ?v: } |. . [---] .| [2:3] Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] -> 'a' * |. . [---> .| [2:3] NP[SEM=[BO={?b1+?b2+{bo(?det(?n),@x)}}, CORE=<@x>]] -> Det[SEM=[BO=?b1, CORE=?det]] * N[SEM=[BO=?b2, CORE=?n]] {?b1: {/}, ?det: } |. . . [---]| [3:4] N[SEM=[BO={/}, CORE=]] -> 'dog' * |. . [-------]| [2:4] NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z103)}, CORE=]] -> Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] N[SEM=[BO={/}, CORE=]] * |. . [------->| [2:4] S[SEM=[BO={?b1+?b2}, CORE=]] -> NP[SEM=[BO=?b1, CORE=?subj]] * VP[SEM=[BO=?b2, CORE=?vp]] {?b1: {bo(\P.exists x.(dog(x) & P(x)),z2)}, ?subj: } |. [-----------]| [1:4] VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]] -> TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=]] * |[===============]| [0:4] S[SEM=[BO={bo(\P.P(John),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=]] -> NP[SEM=[BO={bo(\P.P(John),z2)}, CORE=]] VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=<\y.feed(y,z3)>]] * >>> for tree in trees: print(tree) (S[SEM=[BO={bo(\P.P(John),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=]] (NP[SEM=[BO={bo(\P.P(John),z102)}, CORE=]] john) (VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]] (TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds) (NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z103)}, CORE=]] (Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a) (N[SEM=[BO={/}, CORE=]] dog)))) Turn down the verbosity: >>> cp = parse.load_parser('grammars/sample_grammars/bindop.fcfg', trace=0, ... chart_class=InstantiateVarsChart) Reuse the same lexical item twice: >>> trees = cp.parse('john feeds john'.split()) >>> for tree in trees: print(tree) (S[SEM=[BO={bo(\P.P(John),z2), bo(\P.P(John),z3)}, CORE=]] (NP[SEM=[BO={bo(\P.P(John),z104)}, CORE=]] john) (VP[SEM=[BO={bo(\P.P(John),z2)}, CORE=<\y.feed(y,z2)>]] (TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds) (NP[SEM=[BO={bo(\P.P(John),z105)}, CORE=]] john))) >>> trees = cp.parse('a dog feeds a dog'.split()) >>> for tree in trees: print(tree) (S[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2), bo(\P.exists x.(dog(x) & P(x)),z3)}, CORE=]] (NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z106)}, CORE=]] (Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a) (N[SEM=[BO={/}, CORE=]] dog)) (VP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z2)}, CORE=<\y.feed(y,z2)>]] (TV[SEM=[BO={/}, CORE=<\x y.feed(y,x)>]] feeds) (NP[SEM=[BO={bo(\P.exists x.(dog(x) & P(x)),z107)}, CORE=]] (Det[SEM=[BO={/}, CORE=<\Q P.exists x.(Q(x) & P(x))>]] a) (N[SEM=[BO={/}, CORE=]] dog)))) nltk-3.1/nltk/test/featstruct.doctest0000644000076500000240000011152012607224144017560 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ================================== Feature Structures & Unification ================================== >>> from __future__ import print_function >>> from nltk.featstruct import FeatStruct >>> from nltk.sem.logic import Variable, VariableExpression, Expression .. note:: For now, featstruct uses the older lambdalogic semantics module. 
Eventually, it should be updated to use the new first order predicate logic module. Overview ~~~~~~~~ A feature structure is a mapping from feature identifiers to feature values, where feature values can be simple values (like strings or ints), nested feature structures, or variables: >>> fs1 = FeatStruct(number='singular', person=3) >>> print(fs1) [ number = 'singular' ] [ person = 3 ] Feature structure may be nested: >>> fs2 = FeatStruct(type='NP', agr=fs1) >>> print(fs2) [ agr = [ number = 'singular' ] ] [ [ person = 3 ] ] [ ] [ type = 'NP' ] Variables are used to indicate that two features should be assigned the same value. For example, the following feature structure requires that the feature fs3['agr']['number'] be bound to the same value as the feature fs3['subj']['number']. >>> fs3 = FeatStruct(agr=FeatStruct(number=Variable('?n')), ... subj=FeatStruct(number=Variable('?n'))) >>> print(fs3) [ agr = [ number = ?n ] ] [ ] [ subj = [ number = ?n ] ] Feature structures are typically used to represent partial information about objects. A feature name that is not mapped to a value stands for a feature whose value is unknown (*not* a feature without a value). Two feature structures that represent (potentially overlapping) information about the same object can be combined by *unification*. >>> print(fs2.unify(fs3)) [ agr = [ number = 'singular' ] ] [ [ person = 3 ] ] [ ] [ subj = [ number = 'singular' ] ] [ ] [ type = 'NP' ] When two inconsistent feature structures are unified, the unification fails and returns ``None``. >>> fs4 = FeatStruct(agr=FeatStruct(person=1)) >>> print(fs4.unify(fs2)) None >>> print(fs2.unify(fs4)) None .. >>> del fs1, fs2, fs3, fs4 # clean-up Feature Structure Types ----------------------- There are actually two types of feature structure: - *feature dictionaries*, implemented by `FeatDict`, act like Python dictionaries. Feature identifiers may be strings or instances of the `Feature` class. - *feature lists*, implemented by `FeatList`, act like Python lists. Feature identifiers are integers. When you construct a feature structure using the `FeatStruct` constructor, it will automatically decide which type is appropriate: >>> type(FeatStruct(number='singular')) >>> type(FeatStruct([1,2,3])) Usually, we will just use feature dictionaries; but sometimes feature lists can be useful too. Two feature lists will unify with each other only if they have equal lengths, and all of their feature values match. If you wish to write a feature list that contains 'unknown' values, you must use variables: >>> fs1 = FeatStruct([1,2,Variable('?y')]) >>> fs2 = FeatStruct([1,Variable('?x'),3]) >>> fs1.unify(fs2) [1, 2, 3] .. >>> del fs1, fs2 # clean-up Parsing Feature Structure Strings --------------------------------- Feature structures can be constructed directly from strings. Often, this is more convenient than constructing them directly. NLTK can parse most feature strings to produce the corresponding feature structures. (But you must restrict your base feature values to strings, ints, logic expressions (`nltk.sem.logic.Expression`), and a few other types discussed below). 
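As a minimal sketch (the feature names ``tense`` and ``num`` here are
invented purely for illustration), note that parsed base values keep their
Python types, so an integer-valued feature behaves like an ordinary ``int``:

    >>> fs = FeatStruct('[tense="past", num=3]')
    >>> fs['num'] + 1
    4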
Feature dictionaries are written like Python dictionaries, except that keys are not put in quotes; and square brackets (``[]``) are used instead of braces (``{}``): >>> FeatStruct('[tense="past", agr=[number="sing", person=3]]') [agr=[number='sing', person=3], tense='past'] If a feature value is a single alphanumeric word, then it does not need to be quoted -- it will be automatically treated as a string: >>> FeatStruct('[tense=past, agr=[number=sing, person=3]]') [agr=[number='sing', person=3], tense='past'] Feature lists are written like python lists: >>> FeatStruct('[1, 2, 3]') [1, 2, 3] The expression ``[]`` is treated as an empty feature dictionary, not an empty feature list: >>> type(FeatStruct('[]')) Feature Paths ------------- Features can be specified using *feature paths*, or tuples of feature identifiers that specify path through the nested feature structures to a value. >>> fs1 = FeatStruct('[x=1, y=[1,2,[z=3]]]') >>> fs1['y'] [1, 2, [z=3]] >>> fs1['y', 2] [z=3] >>> fs1['y', 2, 'z'] 3 .. >>> del fs1 # clean-up Reentrance ---------- Feature structures may contain reentrant feature values. A *reentrant feature value* is a single feature structure that can be accessed via multiple feature paths. >>> fs1 = FeatStruct(x='val') >>> fs2 = FeatStruct(a=fs1, b=fs1) >>> print(fs2) [ a = (1) [ x = 'val' ] ] [ ] [ b -> (1) ] >>> fs2 [a=(1)[x='val'], b->(1)] As you can see, reentrane is displayed by marking a feature structure with a unique identifier, in this case ``(1)``, the first time it is encountered; and then using the special form ``var -> id`` whenever it is encountered again. You can use the same notation to directly create reentrant feature structures from strings. >>> FeatStruct('[a=(1)[], b->(1), c=[d->(1)]]') [a=(1)[], b->(1), c=[d->(1)]] Reentrant feature structures may contain cycles: >>> fs3 = FeatStruct('(1)[a->(1)]') >>> fs3['a', 'a', 'a', 'a'] (1)[a->(1)] >>> fs3['a', 'a', 'a', 'a'] is fs3 True Unification preserves the reentrance relations imposed by both of the unified feature structures. In the feature structure resulting from unification, any modifications to a reentrant feature value will be visible using any of its feature paths. >>> fs3.unify(FeatStruct('[a=[b=12], c=33]')) (1)[a->(1), b=12, c=33] .. >>> del fs1, fs2, fs3 # clean-up Feature Structure Equality -------------------------- Two feature structures are considered equal if they assign the same values to all features, *and* they contain the same reentrances. >>> fs1 = FeatStruct('[a=(1)[x=1], b->(1)]') >>> fs2 = FeatStruct('[a=(1)[x=1], b->(1)]') >>> fs3 = FeatStruct('[a=[x=1], b=[x=1]]') >>> fs1 == fs1, fs1 is fs1 (True, True) >>> fs1 == fs2, fs1 is fs2 (True, False) >>> fs1 == fs3, fs1 is fs3 (False, False) Note that this differs from how Python dictionaries and lists define equality -- in particular, Python dictionaries and lists ignore reentrance relations. To test two feature structures for equality while ignoring reentrance relations, use the `equal_values()` method: >>> fs1.equal_values(fs1) True >>> fs1.equal_values(fs2) True >>> fs1.equal_values(fs3) True .. >>> del fs1, fs2, fs3 # clean-up Feature Value Sets & Feature Value Tuples ----------------------------------------- `nltk.featstruct` defines two new data types that are intended to be used as feature values: `FeatureValueTuple` and `FeatureValueSet`. Both of these types are considered base values -- i.e., unification does *not* apply to them. However, variable binding *does* apply to any values that they contain. 
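As a minimal sketch (the feature name ``path`` is an arbitrary illustrative
choice), such values can also be built programmatically and handed to the
`FeatStruct` constructor:

    >>> from nltk.featstruct import FeatureValueTuple
    >>> FeatStruct(path=FeatureValueTuple([1, 2, 3]))  # stored as a base value, not unified element-wise
    [path=(1, 2, 3)]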
Feature value tuples are written with parentheses: >>> fs1 = FeatStruct('[x=(?x, ?y)]') >>> fs1 [x=(?x, ?y)] >>> fs1.substitute_bindings({Variable('?x'): 1, Variable('?y'): 2}) [x=(1, 2)] Feature sets are written with braces: >>> fs1 = FeatStruct('[x={?x, ?y}]') >>> fs1 [x={?x, ?y}] >>> fs1.substitute_bindings({Variable('?x'): 1, Variable('?y'): 2}) [x={1, 2}] In addition to the basic feature value tuple & set classes, nltk defines feature value unions (for sets) and feature value concatenations (for tuples). These are written using '+', and can be used to combine sets & tuples: >>> fs1 = FeatStruct('[x=((1, 2)+?z), z=?z]') >>> fs1 [x=((1, 2)+?z), z=?z] >>> fs1.unify(FeatStruct('[z=(3, 4, 5)]')) [x=(1, 2, 3, 4, 5), z=(3, 4, 5)] Thus, feature value tuples and sets can be used to build up tuples and sets of values over the corse of unification. For example, when parsing sentences using a semantic feature grammar, feature sets or feature tuples can be used to build a list of semantic predicates as the sentence is parsed. As was mentioned above, unification does not apply to feature value tuples and sets. One reason for this that it's impossible to define a single correct answer for unification when concatenation is used. Consider the following example: >>> fs1 = FeatStruct('[x=(1, 2, 3, 4)]') >>> fs2 = FeatStruct('[x=(?a+?b), a=?a, b=?b]') If unification applied to feature tuples, then the unification algorithm would have to arbitrarily choose how to divide the tuple (1,2,3,4) into two parts. Instead, the unification algorithm refuses to make this decision, and simply unifies based on value. Because (1,2,3,4) is not equal to (?a+?b), fs1 and fs2 will not unify: >>> print(fs1.unify(fs2)) None If you need a list-like structure that unification does apply to, use `FeatList`. .. >>> del fs1, fs2 # clean-up Light-weight Feature Structures ------------------------------- Many of the functions defined by `nltk.featstruct` can be applied directly to simple Python dictionaries and lists, rather than to full-fledged `FeatDict` and `FeatList` objects. In other words, Python ``dicts`` and ``lists`` can be used as "light-weight" feature structures. >>> # Note: pprint prints dicts sorted >>> from pprint import pprint >>> from nltk.featstruct import unify >>> pprint(unify(dict(x=1, y=dict()), dict(a='a', y=dict(b='b')))) {'a': 'a', 'x': 1, 'y': {'b': 'b'}} However, you should keep in mind the following caveats: - Python dictionaries & lists ignore reentrance when checking for equality between values. But two FeatStructs with different reentrances are considered nonequal, even if all their base values are equal. - FeatStructs can be easily frozen, allowing them to be used as keys in hash tables. Python dictionaries and lists can not. - FeatStructs display reentrance in their string representations; Python dictionaries and lists do not. - FeatStructs may *not* be mixed with Python dictionaries and lists (e.g., when performing unification). - FeatStructs provide a number of useful methods, such as `walk()` and `cyclic()`, which are not available for Python dicts & lists. In general, if your feature structures will contain any reentrances, or if you plan to use them as dictionary keys, it is strongly recommended that you use full-fledged `FeatStruct` objects. Custom Feature Values --------------------- The abstract base class `CustomFeatureValue` can be used to define new base value types that have custom unification methods. 
For example, the following feature value type encodes a range, and defines unification as taking the intersection on the ranges: >>> from nltk.compat import total_ordering >>> from nltk.featstruct import CustomFeatureValue, UnificationFailure >>> @total_ordering ... class Range(CustomFeatureValue): ... def __init__(self, low, high): ... assert low <= high ... self.low = low ... self.high = high ... def unify(self, other): ... if not isinstance(other, Range): ... return UnificationFailure ... low = max(self.low, other.low) ... high = min(self.high, other.high) ... if low <= high: return Range(low, high) ... else: return UnificationFailure ... def __repr__(self): ... return '(%s>> fs1 = FeatStruct(x=Range(5,8), y=FeatStruct(z=Range(7,22))) >>> print(fs1.unify(FeatStruct(x=Range(6, 22)))) [ x = (6>> print(fs1.unify(FeatStruct(x=Range(9, 12)))) None >>> print(fs1.unify(FeatStruct(x=12))) None >>> print(fs1.unify(FeatStruct('[x=?x, y=[z=?x]]'))) [ x = (7>> fs1 = FeatStruct(a=1, b=2, c=3) >>> fs2 = FeatStruct(x=fs1, y='x') Feature structures support all dictionary methods (excluding the class method `dict.fromkeys()`). Non-mutating methods: >>> sorted(fs2.keys()) # keys() ['x', 'y'] >>> sorted(fs2.values()) # values() [[a=1, b=2, c=3], 'x'] >>> sorted(fs2.items()) # items() [('x', [a=1, b=2, c=3]), ('y', 'x')] >>> sorted(fs2) # __iter__() ['x', 'y'] >>> 'a' in fs2, 'x' in fs2 # __contains__() (False, True) >>> fs2.has_key('a'), fs2.has_key('x') # has_key() (False, True) >>> fs2['x'], fs2['y'] # __getitem__() ([a=1, b=2, c=3], 'x') >>> fs2['a'] # __getitem__() Traceback (most recent call last): . . . KeyError: 'a' >>> fs2.get('x'), fs2.get('y'), fs2.get('a') # get() ([a=1, b=2, c=3], 'x', None) >>> fs2.get('x', 'hello'), fs2.get('a', 'hello') # get() ([a=1, b=2, c=3], 'hello') >>> len(fs1), len(fs2) # __len__ (3, 2) >>> fs2.copy() # copy() [x=[a=1, b=2, c=3], y='x'] >>> fs2.copy() is fs2 # copy() False Note: by default, `FeatStruct.copy()` does a deep copy. Use `FeatStruct.copy(deep=False)` for a shallow copy. .. >>> del fs1, fs2 # clean-up. Dictionary access methods (mutating) ------------------------------------ >>> fs1 = FeatStruct(a=1, b=2, c=3) >>> fs2 = FeatStruct(x=fs1, y='x') Setting features (`__setitem__()`) >>> fs1['c'] = 5 >>> fs1 [a=1, b=2, c=5] >>> fs1['x'] = 12 >>> fs1 [a=1, b=2, c=5, x=12] >>> fs2['x', 'a'] = 2 >>> fs2 [x=[a=2, b=2, c=5, x=12], y='x'] >>> fs1 [a=2, b=2, c=5, x=12] Deleting features (`__delitem__()`) >>> del fs1['x'] >>> fs1 [a=2, b=2, c=5] >>> del fs2['x', 'a'] >>> fs1 [b=2, c=5] `setdefault()`: >>> fs1.setdefault('b', 99) 2 >>> fs1 [b=2, c=5] >>> fs1.setdefault('x', 99) 99 >>> fs1 [b=2, c=5, x=99] `update()`: >>> fs2.update({'a':'A', 'b':'B'}, c='C') >>> fs2 [a='A', b='B', c='C', x=[b=2, c=5, x=99], y='x'] `pop()`: >>> fs2.pop('a') 'A' >>> fs2 [b='B', c='C', x=[b=2, c=5, x=99], y='x'] >>> fs2.pop('a') Traceback (most recent call last): . . . KeyError: 'a' >>> fs2.pop('a', 'foo') 'foo' >>> fs2 [b='B', c='C', x=[b=2, c=5, x=99], y='x'] `clear()`: >>> fs1.clear() >>> fs1 [] >>> fs2 [b='B', c='C', x=[], y='x'] `popitem()`: >>> sorted([fs2.popitem() for i in range(len(fs2))]) [('b', 'B'), ('c', 'C'), ('x', []), ('y', 'x')] >>> fs2 [] Once a feature structure has been frozen, it may not be mutated. >>> fs1 = FeatStruct('[x=1, y=2, z=[a=3]]') >>> fs1.freeze() >>> fs1.frozen() True >>> fs1['z'].frozen() True >>> fs1['x'] = 5 Traceback (most recent call last): . . . ValueError: Frozen FeatStructs may not be modified. >>> del fs1['x'] Traceback (most recent call last): . . 
. ValueError: Frozen FeatStructs may not be modified. >>> fs1.clear() Traceback (most recent call last): . . . ValueError: Frozen FeatStructs may not be modified. >>> fs1.pop('x') Traceback (most recent call last): . . . ValueError: Frozen FeatStructs may not be modified. >>> fs1.popitem() Traceback (most recent call last): . . . ValueError: Frozen FeatStructs may not be modified. >>> fs1.setdefault('x') Traceback (most recent call last): . . . ValueError: Frozen FeatStructs may not be modified. >>> fs1.update(z=22) Traceback (most recent call last): . . . ValueError: Frozen FeatStructs may not be modified. .. >>> del fs1, fs2 # clean-up. Feature Paths ------------- Make sure that __getitem__ with feature paths works as intended: >>> fs1 = FeatStruct(a=1, b=2, ... c=FeatStruct( ... d=FeatStruct(e=12), ... f=FeatStruct(g=55, h='hello'))) >>> fs1[()] [a=1, b=2, c=[d=[e=12], f=[g=55, h='hello']]] >>> fs1['a'], fs1[('a',)] (1, 1) >>> fs1['c','d','e'] 12 >>> fs1['c','f','g'] 55 Feature paths that select unknown features raise KeyError: >>> fs1['c', 'f', 'e'] Traceback (most recent call last): . . . KeyError: ('c', 'f', 'e') >>> fs1['q', 'p'] Traceback (most recent call last): . . . KeyError: ('q', 'p') Feature paths that try to go 'through' a feature that's not a feature structure raise KeyError: >>> fs1['a', 'b'] Traceback (most recent call last): . . . KeyError: ('a', 'b') Feature paths can go through reentrant structures: >>> fs2 = FeatStruct('(1)[a=[b=[c->(1), d=5], e=11]]') >>> fs2['a', 'b', 'c', 'a', 'e'] 11 >>> fs2['a', 'b', 'c', 'a', 'b', 'd'] 5 >>> fs2[tuple('abcabcabcabcabcabcabcabcabcabca')] (1)[b=[c=[a->(1)], d=5], e=11] Indexing requires strings, `Feature`\s, or tuples; other types raise a TypeError: >>> fs2[12] Traceback (most recent call last): . . . TypeError: Expected feature name or path. Got 12. >>> fs2[list('abc')] Traceback (most recent call last): . . . TypeError: Expected feature name or path. Got ['a', 'b', 'c']. Feature paths can also be used with `get()`, `has_key()`, and `__contains__()`. >>> fpath1 = tuple('abcabc') >>> fpath2 = tuple('abcabz') >>> fs2.get(fpath1), fs2.get(fpath2) ((1)[a=[b=[c->(1), d=5], e=11]], None) >>> fpath1 in fs2, fpath2 in fs2 (True, False) >>> fs2.has_key(fpath1), fs2.has_key(fpath2) (True, False) .. >>> del fs1, fs2 # clean-up Reading Feature Structures -------------------------- Empty feature struct: >>> FeatStruct('[]') [] Test features with integer values: >>> FeatStruct('[a=12, b=-33, c=0]') [a=12, b=-33, c=0] Test features with string values. Either single or double quotes may be used. Strings are evaluated just like python strings -- in particular, you can use escape sequences and 'u' and 'r' prefixes, and triple-quoted strings. >>> FeatStruct('[a="", b="hello", c="\'", d=\'\', e=\'"\']') [a='', b='hello', c="'", d='', e='"'] >>> FeatStruct(r'[a="\\", b="\"", c="\x6f\\y", d="12"]') [a='\\', b='"', c='o\\y', d='12'] >>> FeatStruct(r'[b=r"a\b\c"]') [b='a\\b\\c'] >>> FeatStruct('[x="""a"""]') [x='a'] Test parsing of reentrant feature structures. >>> FeatStruct('[a=(1)[], b->(1)]') [a=(1)[], b->(1)] >>> FeatStruct('[a=(1)[x=1, y=2], b->(1)]') [a=(1)[x=1, y=2], b->(1)] Test parsing of cyclic feature structures. >>> FeatStruct('[a=(1)[b->(1)]]') [a=(1)[b->(1)]] >>> FeatStruct('(1)[a=[b=[c->(1)]]]') (1)[a=[b=[c->(1)]]] Strings of the form "+name" and "-name" may be used to specify boolean values. 
>>> FeatStruct('[-bar, +baz, +foo]') [-bar, +baz, +foo] None, True, and False are recognized as values: >>> FeatStruct('[bar=True, baz=False, foo=None]') [+bar, -baz, foo=None] Special features: >>> FeatStruct('NP/VP') NP[]/VP[] >>> FeatStruct('?x/?x') ?x[]/?x[] >>> print(FeatStruct('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]')) [ *type* = 'VP' ] [ ] [ [ *type* = 'NP' ] ] [ *slash* = [ agr = ?x ] ] [ [ pl = True ] ] [ ] [ agr = ?x ] [ fin = True ] [ tense = 'past' ] Here the slash feature gets coerced: >>> FeatStruct('[*slash*=a, x=b, *type*="NP"]') NP[x='b']/a[] >>> FeatStruct('NP[sem=]/NP') NP[sem=]/NP[] >>> FeatStruct('S[sem=]') S[sem=] >>> print(FeatStruct('NP[sem=]/NP')) [ *type* = 'NP' ] [ ] [ *slash* = [ *type* = 'NP' ] ] [ ] [ sem = ] Playing with ranges: >>> from nltk.featstruct import RangeFeature, FeatStructReader >>> width = RangeFeature('width') >>> reader = FeatStructReader([width]) >>> fs1 = reader.fromstring('[*width*=-5:12]') >>> fs2 = reader.fromstring('[*width*=2:123]') >>> fs3 = reader.fromstring('[*width*=-7:-2]') >>> fs1.unify(fs2) [*width*=(2, 12)] >>> fs1.unify(fs3) [*width*=(-5, -2)] >>> print(fs2.unify(fs3)) # no overlap in width. None The slash feature has a default value of 'False': >>> print(FeatStruct('NP[]/VP').unify(FeatStruct('NP[]'), trace=1)) Unification trace: / NP[]/VP[] |\ NP[] | | Unify feature: *type* | / 'NP' | |\ 'NP' | | | +-->'NP' | | Unify feature: *slash* | / VP[] | |\ False | | X X <-- FAIL None The demo structures from category.py. They all parse, but they don't do quite the right thing, -- ?x vs x. >>> FeatStruct(pos='n', agr=FeatStruct(number='pl', gender='f')) [agr=[gender='f', number='pl'], pos='n'] >>> FeatStruct(r'NP[sem=]/NP') NP[sem=]/NP[] >>> FeatStruct(r'S[sem=]') S[sem=] >>> FeatStruct('?x/?x') ?x[]/?x[] >>> FeatStruct('VP[+fin, agr=?x, tense=past]/NP[+pl, agr=?x]') VP[agr=?x, +fin, tense='past']/NP[agr=?x, +pl] >>> FeatStruct('S[sem = ]') S[sem=] >>> FeatStruct('S') S[] The parser also includes support for reading sets and tuples. >>> FeatStruct('[x={1,2,2,2}, y={/}]') [x={1, 2}, y={/}] >>> FeatStruct('[x=(1,2,2,2), y=()]') [x=(1, 2, 2, 2), y=()] >>> print(FeatStruct('[x=(1,[z=(1,2,?x)],?z,{/})]')) [ x = (1, [ z = (1, 2, ?x) ], ?z, {/}) ] Note that we can't put a featstruct inside a tuple, because doing so would hash it, and it's not frozen yet: >>> print(FeatStruct('[x={[]}]')) Traceback (most recent call last): . . . TypeError: FeatStructs must be frozen before they can be hashed. There's a special syntax for taking the union of sets: "{...+...}". The elements should only be variables or sets. >>> FeatStruct('[x={?a+?b+{1,2,3}}]') [x={?a+?b+{1, 2, 3}}] There's a special syntax for taking the concatenation of tuples: "(...+...)". The elements should only be variables or tuples. >>> FeatStruct('[x=(?a+?b+(1,2,3))]') [x=(?a+?b+(1, 2, 3))] Parsing gives helpful messages if your string contains an error. >>> FeatStruct('[a=, b=5]]') Traceback (most recent call last): . . . ValueError: Error parsing feature structure [a=, b=5]] ^ Expected value >>> FeatStruct('[a=12 22, b=33]') Traceback (most recent call last): . . . ValueError: Error parsing feature structure [a=12 22, b=33] ^ Expected comma >>> FeatStruct('[a=5] [b=6]') Traceback (most recent call last): . . . ValueError: Error parsing feature structure [a=5] [b=6] ^ Expected end of string >>> FeatStruct(' *++*') Traceback (most recent call last): . . . 
ValueError: Error parsing feature structure *++* ^ Expected open bracket or identifier >>> FeatStruct('[x->(1)]') Traceback (most recent call last): . . . ValueError: Error parsing feature structure [x->(1)] ^ Expected bound identifier >>> FeatStruct('[x->y]') Traceback (most recent call last): . . . ValueError: Error parsing feature structure [x->y] ^ Expected identifier >>> FeatStruct('') Traceback (most recent call last): . . . ValueError: Error parsing feature structure ^ Expected open bracket or identifier Unification ----------- Very simple unifications give the expected results: >>> FeatStruct().unify(FeatStruct()) [] >>> FeatStruct(number='singular').unify(FeatStruct()) [number='singular'] >>> FeatStruct().unify(FeatStruct(number='singular')) [number='singular'] >>> FeatStruct(number='singular').unify(FeatStruct(person=3)) [number='singular', person=3] Merging nested structures: >>> fs1 = FeatStruct('[A=[B=b]]') >>> fs2 = FeatStruct('[A=[C=c]]') >>> fs1.unify(fs2) [A=[B='b', C='c']] >>> fs2.unify(fs1) [A=[B='b', C='c']] A basic case of reentrant unification >>> fs4 = FeatStruct('[A=(1)[B=b], E=[F->(1)]]') >>> fs5 = FeatStruct("[A=[C='c'], E=[F=[D='d']]]") >>> fs4.unify(fs5) [A=(1)[B='b', C='c', D='d'], E=[F->(1)]] >>> fs5.unify(fs4) [A=(1)[B='b', C='c', D='d'], E=[F->(1)]] More than 2 paths to a value >>> fs1 = FeatStruct("[a=[],b=[],c=[],d=[]]") >>> fs2 = FeatStruct('[a=(1)[], b->(1), c->(1), d->(1)]') >>> fs1.unify(fs2) [a=(1)[], b->(1), c->(1), d->(1)] fs1[a] gets unified with itself >>> fs1 = FeatStruct('[x=(1)[], y->(1)]') >>> fs2 = FeatStruct('[x=(1)[], y->(1)]') >>> fs1.unify(fs2) [x=(1)[], y->(1)] Bound variables should get forwarded appropriately >>> fs1 = FeatStruct('[A=(1)[X=x], B->(1), C=?cvar, D=?dvar]') >>> fs2 = FeatStruct('[A=(1)[Y=y], B=(2)[Z=z], C->(1), D->(2)]') >>> fs1.unify(fs2) [A=(1)[X='x', Y='y', Z='z'], B->(1), C->(1), D->(1)] >>> fs2.unify(fs1) [A=(1)[X='x', Y='y', Z='z'], B->(1), C->(1), D->(1)] Cyclic structure created by unification. >>> fs1 = FeatStruct('[F=(1)[], G->(1)]') >>> fs2 = FeatStruct('[F=[H=(2)[]], G->(2)]') >>> fs3 = fs1.unify(fs2) >>> fs3 [F=(1)[H->(1)], G->(1)] >>> fs3['F'] is fs3['G'] True >>> fs3['F'] is fs3['G']['H'] True >>> fs3['F'] is fs3['G']['H']['H'] True >>> fs3['F'] is fs3['F']['H']['H']['H']['H']['H']['H']['H']['H'] True Cyclic structure created w/ variables. >>> fs1 = FeatStruct('[F=[H=?x]]') >>> fs2 = FeatStruct('[F=?x]') >>> fs3 = fs1.unify(fs2, rename_vars=False) >>> fs3 [F=(1)[H->(1)]] >>> fs3['F'] is fs3['F']['H'] True >>> fs3['F'] is fs3['F']['H']['H'] True >>> fs3['F'] is fs3['F']['H']['H']['H']['H']['H']['H']['H']['H'] True Unifying w/ a cyclic feature structure. >>> fs4 = FeatStruct('[F=[H=[H=[H=(1)[]]]], K->(1)]') >>> fs3.unify(fs4) [F=(1)[H->(1)], K->(1)] >>> fs4.unify(fs3) [F=(1)[H->(1)], K->(1)] Variable bindings should preserve reentrance. >>> bindings = {} >>> fs1 = FeatStruct("[a=?x]") >>> fs2 = fs1.unify(FeatStruct("[a=[]]"), bindings) >>> fs2['a'] is bindings[Variable('?x')] True >>> fs2.unify(FeatStruct("[b=?x]"), bindings) [a=(1)[], b->(1)] Aliased variable tests >>> fs1 = FeatStruct("[a=?x, b=?x]") >>> fs2 = FeatStruct("[b=?y, c=?y]") >>> bindings = {} >>> fs3 = fs1.unify(fs2, bindings) >>> fs3 [a=?x, b=?x, c=?x] >>> bindings {Variable('?y'): Variable('?x')} >>> fs3.unify(FeatStruct("[a=1]")) [a=1, b=1, c=1] If we keep track of the bindings, then we can use the same variable over multiple calls to unify. 
>>> bindings = {} >>> fs1 = FeatStruct('[a=?x]') >>> fs2 = fs1.unify(FeatStruct('[a=[]]'), bindings) >>> fs2.unify(FeatStruct('[b=?x]'), bindings) [a=(1)[], b->(1)] >>> bindings {Variable('?x'): []} .. >>> del fs1, fs2, fs3, fs4, fs5 # clean-up Unification Bindings -------------------- >>> bindings = {} >>> fs1 = FeatStruct('[a=?x]') >>> fs2 = FeatStruct('[a=12]') >>> fs3 = FeatStruct('[b=?x]') >>> fs1.unify(fs2, bindings) [a=12] >>> bindings {Variable('?x'): 12} >>> fs3.substitute_bindings(bindings) [b=12] >>> fs3 # substitute_bindings didn't mutate fs3. [b=?x] >>> fs2.unify(fs3, bindings) [a=12, b=12] >>> bindings = {} >>> fs1 = FeatStruct('[a=?x, b=1]') >>> fs2 = FeatStruct('[a=5, b=?x]') >>> fs1.unify(fs2, bindings) [a=5, b=1] >>> sorted(bindings.items()) [(Variable('?x'), 5), (Variable('?x2'), 1)] .. >>> del fs1, fs2, fs3 # clean-up Expressions ----------- >>> e = Expression.fromstring('\\P y.P(z,y)') >>> fs1 = FeatStruct(x=e, y=Variable('z')) >>> fs2 = FeatStruct(y=VariableExpression(Variable('John'))) >>> fs1.unify(fs2) [x=<\P y.P(John,y)>, y=] Remove Variables ---------------- >>> FeatStruct('[a=?x, b=12, c=[d=?y]]').remove_variables() [b=12, c=[]] >>> FeatStruct('(1)[a=[b=?x,c->(1)]]').remove_variables() (1)[a=[c->(1)]] Equality & Hashing ------------------ The `equal_values` method checks whether two feature structures assign the same value to every feature. If the optional argument ``check_reentrances`` is supplied, then it also returns false if there is any difference in the reentrances. >>> a = FeatStruct('(1)[x->(1)]') >>> b = FeatStruct('(1)[x->(1)]') >>> c = FeatStruct('(1)[x=[x->(1)]]') >>> d = FeatStruct('[x=(1)[x->(1)]]') >>> e = FeatStruct('(1)[x=[x->(1), y=1], y=1]') >>> def compare(x,y): ... assert x.equal_values(y, True) == y.equal_values(x, True) ... assert x.equal_values(y, False) == y.equal_values(x, False) ... if x.equal_values(y, True): ... assert x.equal_values(y, False) ... print('equal values, same reentrance') ... elif x.equal_values(y, False): ... print('equal values, different reentrance') ... else: ... print('different values') >>> compare(a, a) equal values, same reentrance >>> compare(a, b) equal values, same reentrance >>> compare(a, c) equal values, different reentrance >>> compare(a, d) equal values, different reentrance >>> compare(c, d) equal values, different reentrance >>> compare(a, e) different values >>> compare(c, e) different values >>> compare(d, e) different values >>> compare(e, e) equal values, same reentrance Feature structures may not be hashed until they are frozen: >>> hash(a) Traceback (most recent call last): . . . TypeError: FeatStructs must be frozen before they can be hashed. >>> a.freeze() >>> v = hash(a) Feature structures define hash consistently. The following example looks at the hash value for each (fs1,fs2) pair; if their hash values are not equal, then they must not be equal. If their hash values are equal, then display a message, and indicate whether their values are indeed equal. Note that c and d currently have the same hash value, even though they are not equal. That is not a bug, strictly speaking, but it wouldn't be a bad thing if it changed. >>> for fstruct in (a, b, c, d, e): ... fstruct.freeze() >>> for fs1_name in 'abcde': ... for fs2_name in 'abcde': ... fs1 = locals()[fs1_name] ... fs2 = locals()[fs2_name] ... if hash(fs1) != hash(fs2): ... assert fs1 != fs2 ... else: ... print('%s and %s have the same hash value,' % ... (fs1_name, fs2_name)) ... if fs1 == fs2: print('and are equal') ... 
else: print('and are not equal') a and a have the same hash value, and are equal a and b have the same hash value, and are equal b and a have the same hash value, and are equal b and b have the same hash value, and are equal c and c have the same hash value, and are equal c and d have the same hash value, and are not equal d and c have the same hash value, and are not equal d and d have the same hash value, and are equal e and e have the same hash value, and are equal .. >>> del a, b, c, d, e, v # clean-up Tracing ------- >>> fs1 = FeatStruct('[a=[b=(1)[], c=?x], d->(1), e=[f=?x]]') >>> fs2 = FeatStruct('[a=(1)[c="C"], e=[g->(1)]]') >>> fs1.unify(fs2, trace=True) Unification trace: / [a=[b=(1)[], c=?x], d->(1), e=[f=?x]] |\ [a=(1)[c='C'], e=[g->(1)]] | | Unify feature: a | / [b=[], c=?x] | |\ [c='C'] | | | | Unify feature: a.c | | / ?x | | |\ 'C' | | | | | +-->Variable('?x') | | | +-->[b=[], c=?x] | Bindings: {?x: 'C'} | | Unify feature: e | / [f=?x] | |\ [g=[c='C']] | | | +-->[f=?x, g=[b=[], c=?x]] | Bindings: {?x: 'C'} | +-->[a=(1)[b=(2)[], c='C'], d->(2), e=[f='C', g->(1)]] Bindings: {?x: 'C'} [a=(1)[b=(2)[], c='C'], d->(2), e=[f='C', g->(1)]] >>> >>> fs1 = FeatStruct('[a=?x, b=?z, c=?z]') >>> fs2 = FeatStruct('[a=?y, b=?y, c=?q]') >>> #fs1.unify(fs2, trace=True) >>> .. >>> del fs1, fs2 # clean-up Unification on Dicts & Lists ---------------------------- It's possible to do unification on dictionaries: >>> from nltk.featstruct import unify >>> pprint(unify(dict(x=1, y=dict(z=2)), dict(x=1, q=5)), width=1) {'q': 5, 'x': 1, 'y': {'z': 2}} It's possible to do unification on lists as well: >>> unify([1, 2, 3], [1, Variable('x'), 3]) [1, 2, 3] Mixing dicts and lists is fine: >>> pprint(unify([dict(x=1, y=dict(z=2)),3], [dict(x=1, q=5),3]), ... width=1) [{'q': 5, 'x': 1, 'y': {'z': 2}}, 3] Mixing dicts and FeatStructs is discouraged: >>> unify(dict(x=1), FeatStruct(x=1)) Traceback (most recent call last): . . . ValueError: Mixing FeatStruct objects with Python dicts and lists is not supported. But you can do it if you really want, by explicitly stating that both dictionaries and FeatStructs should be treated as feature structures: >>> unify(dict(x=1), FeatStruct(x=1), fs_class=(dict, FeatStruct)) {'x': 1} Finding Conflicts ----------------- >>> from nltk.featstruct import conflicts >>> fs1 = FeatStruct('[a=[b=(1)[c=2], d->(1), e=[f->(1)]]]') >>> fs2 = FeatStruct('[a=[b=[c=[x=5]], d=[c=2], e=[f=[c=3]]]]') >>> for path in conflicts(fs1, fs2): ... print('%-8s: %r vs %r' % ('.'.join(path), fs1[path], fs2[path])) a.b.c : 2 vs [x=5] a.e.f.c : 2 vs 3 .. >>> del fs1, fs2 # clean-up Retracting Bindings ------------------- >>> from nltk.featstruct import retract_bindings >>> bindings = {} >>> fs1 = FeatStruct('[a=?x, b=[c=?y]]') >>> fs2 = FeatStruct('[a=(1)[c=[d=1]], b->(1)]') >>> fs3 = fs1.unify(fs2, bindings) >>> print(fs3) [ a = (1) [ c = [ d = 1 ] ] ] [ ] [ b -> (1) ] >>> pprint(bindings) {Variable('?x'): [c=[d=1]], Variable('?y'): [d=1]} >>> retract_bindings(fs3, bindings) [a=?x, b=?x] >>> pprint(bindings) {Variable('?x'): [c=?y], Variable('?y'): [d=1]} Squashed Bugs ~~~~~~~~~~~~~ In svn rev 5167, unifying two feature structures that used the same variable would cause those variables to become aliased in the output. >>> fs1 = FeatStruct('[a=?x]') >>> fs2 = FeatStruct('[b=?x]') >>> fs1.unify(fs2) [a=?x, b=?x2] There was a bug in svn revision 5172 that caused `rename_variables` to rename variables to names that are already used. >>> FeatStruct('[a=?x, b=?x2]').rename_variables( ... 
vars=[Variable('?x')]) [a=?x3, b=?x2] >>> fs1 = FeatStruct('[a=?x]') >>> fs2 = FeatStruct('[a=?x, b=?x2]') >>> fs1.unify(fs2) [a=?x, b=?x2] There was a bug in svn rev 5167 that caused us to get the following example wrong. Basically the problem was that we only followed 'forward' pointers for other, not self, when unifying two feature structures. (nb: this test assumes that features are unified in alphabetical order -- if they are not, it might pass even if the bug is present.) >>> fs1 = FeatStruct('[a=[x=1], b=?x, c=?x]') >>> fs2 = FeatStruct('[a=(1)[], b->(1), c=[x=2]]') >>> print(fs1.unify(fs2)) None .. >>> del fs1, fs2 # clean-up nltk-3.1/nltk/test/framenet.doctest0000644000076500000240000002167112607224144017204 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ======== FrameNet ======== The FrameNet corpus is a lexical database of English that is both human- and machine-readable, based on annotating examples of how words are used in actual texts. FrameNet is based on a theory of meaning called Frame Semantics, deriving from the work of Charles J. Fillmore and colleagues. The basic idea is straightforward: that the meanings of most words can best be understood on the basis of a semantic frame: a description of a type of event, relation, or entity and the participants in it. For example, the concept of cooking typically involves a person doing the cooking (Cook), the food that is to be cooked (Food), something to hold the food while cooking (Container) and a source of heat (Heating_instrument). In the FrameNet project, this is represented as a frame called Apply_heat, and the Cook, Food, Heating_instrument and Container are called frame elements (FEs). Words that evoke this frame, such as fry, bake, boil, and broil, are called lexical units (LUs) of the Apply_heat frame. The job of FrameNet is to define the frames and to annotate sentences to show how the FEs fit syntactically around the word that evokes the frame. ------ Frames ------ A Frame is a script-like conceptual structure that describes a particular type of situation, object, or event along with the participants and props that are needed for that Frame. For example, the "Apply_heat" frame describes a common situation involving a Cook, some Food, and a Heating_Instrument, and is evoked by words such as bake, blanch, boil, broil, brown, simmer, steam, etc. We call the roles of a Frame "frame elements" (FEs) and the frame-evoking words are called "lexical units" (LUs). FrameNet includes relations between Frames. Several types of relations are defined, of which the most important are: - Inheritance: An IS-A relation. The child frame is a subtype of the parent frame, and each FE in the parent is bound to a corresponding FE in the child. An example is the "Revenge" frame which inherits from the "Rewards_and_punishments" frame. - Using: The child frame presupposes the parent frame as background, e.g the "Speed" frame "uses" (or presupposes) the "Motion" frame; however, not all parent FEs need to be bound to child FEs. - Subframe: The child frame is a subevent of a complex event represented by the parent, e.g. the "Criminal_process" frame has subframes of "Arrest", "Arraignment", "Trial", and "Sentencing". - Perspective_on: The child frame provides a particular perspective on an un-perspectivized parent frame. 
A pair of examples consists of the "Hiring" and "Get_a_job" frames, which perspectivize the "Employment_start" frame from the Employer's and the Employee's point of view, respectively. To get a list of all of the Frames in FrameNet, you can use the `frames()` function. If you supply a regular expression pattern to the `frames()` function, you will get a list of all Frames whose names match that pattern: >>> from pprint import pprint >>> from nltk.corpus import framenet as fn >>> len(fn.frames()) 1019 >>> pprint(fn.frames(r'(?i)medical')) [, , ...] To get the details of a particular Frame, you can use the `frame()` function passing in the frame number: >>> from pprint import pprint >>> from nltk.corpus import framenet as fn >>> f = fn.frame(256) >>> f.ID 256 >>> f.name 'Medical_specialties' >>> f.definition # doctest: +ELLIPSIS "This frame includes words that name ..." >>> len(f.lexUnit) 29 >>> pprint(sorted([x for x in f.FE])) ['Affliction', 'Body_system', 'Specialty', 'Type'] >>> pprint(f.frameRelations) [ Child=Medical_specialties>] The `frame()` function shown above returns a dict object containing detailed information about the Frame. See the documentation on the `frame()` function for the specifics. You can also search for Frames by their Lexical Units (LUs). The `frames_by_lemma()` function returns a list of all frames that contain LUs in which the 'name' attribute of the LU matchs the given regular expression. Note that LU names are composed of "lemma.POS", where the "lemma" part can be made up of either a single lexeme (e.g. 'run') or multiple lexemes (e.g. 'a little') (see below). >>> from nltk.corpus import framenet as fn >>> fn.frames_by_lemma(r'(?i)a little') [, ] ------------- Lexical Units ------------- A lexical unit (LU) is a pairing of a word with a meaning. For example, the "Apply_heat" Frame describes a common situation involving a Cook, some Food, and a Heating Instrument, and is _evoked_ by words such as bake, blanch, boil, broil, brown, simmer, steam, etc. These frame-evoking words are the LUs in the Apply_heat frame. Each sense of a polysemous word is a different LU. We have used the word "word" in talking about LUs. The reality is actually rather complex. When we say that the word "bake" is polysemous, we mean that the lemma "bake.v" (which has the word-forms "bake", "bakes", "baked", and "baking") is linked to three different frames: - Apply_heat: "Michelle baked the potatoes for 45 minutes." - Cooking_creation: "Michelle baked her mother a cake for her birthday." - Absorb_heat: "The potatoes have to bake for more than 30 minutes." These constitute three different LUs, with different definitions. Multiword expressions such as "given name" and hyphenated words like "shut-eye" can also be LUs. Idiomatic phrases such as "middle of nowhere" and "give the slip (to)" are also defined as LUs in the appropriate frames ("Isolated_places" and "Evading", respectively), and their internal structure is not analyzed. Framenet provides multiple annotated examples of each sense of a word (i.e. each LU). Moreover, the set of examples (approximately 20 per LU) illustrates all of the combinatorial possibilities of the lexical unit. Each LU is linked to a Frame, and hence to the other words which evoke that Frame. This makes the FrameNet database similar to a thesaurus, grouping together semantically similar words. In the simplest case, frame-evoking words are verbs such as "fried" in: "Matilde fried the catfish in a heavy iron skillet." Sometimes event nouns may evoke a Frame. 
For example, "reduction" evokes "Cause_change_of_scalar_position" in: "...the reduction of debt levels to $665 million from $2.6 billion." Adjectives may also evoke a Frame. For example, "asleep" may evoke the "Sleep" frame as in: "They were asleep for hours." Many common nouns, such as artifacts like "hat" or "tower", typically serve as dependents rather than clearly evoking their own frames. Details for a specific lexical unit can be obtained using this class's `lus()` function, which takes an optional regular expression pattern that will be matched against the name of the lexical unit: >>> from pprint import pprint >>> from nltk.corpus import framenet as fn >>> len(fn.lus()) 11829 >>> pprint(fn.lus(r'(?i)a little')) [, , ...] You can obtain detailed information on a particular LU by calling the `lu()` function and passing in an LU's 'ID' number: >>> from pprint import pprint >>> from nltk.corpus import framenet as fn >>> fn.lu(256).name 'foresee.v' >>> fn.lu(256).definition 'COD: be aware of beforehand; predict.' >>> fn.lu(256).frame.name 'Expectation' >>> fn.lu(256).lexemes[0].name 'foresee' Note that LU names take the form of a dotted string (e.g. "run.v" or "a little.adv") in which a lemma preceeds the "." and a part of speech (POS) follows the dot. The lemma may be composed of a single lexeme (e.g. "run") or of multiple lexemes (e.g. "a little"). The list of POSs used in the LUs is: v - verb n - noun a - adjective adv - adverb prep - preposition num - numbers intj - interjection art - article c - conjunction scon - subordinating conjunction For more detailed information about the info that is contained in the dict that is returned by the `lu()` function, see the documentation on the `lu()` function. ------------------- Annotated Documents ------------------- The FrameNet corpus contains a small set of annotated documents. A list of these documents can be obtained by calling the `documents()` function: >>> from pprint import pprint >>> from nltk.corpus import framenet as fn >>> docs = fn.documents() >>> len(docs) 78 >>> pprint(sorted(docs[0].keys())) ['ID', 'corpid', 'corpname', 'description', 'filename'] Detailed information about each sentence contained in each document can be obtained by calling the `annotated_document()` function and supplying the 'ID' number of the document. For detailed information about the info that is for each document, see the documentation on the `annotated_document()` function. nltk-3.1/nltk/test/generate.doctest0000644000076500000240000000314212607224144017166 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT =============================================== Generating sentences from context-free grammars =============================================== An example grammar: >>> from nltk.parse.generate import generate, demo_grammar >>> from nltk import CFG >>> grammar = CFG.fromstring(demo_grammar) >>> print(grammar) Grammar with 13 productions (start state = S) S -> NP VP NP -> Det N PP -> P NP VP -> 'slept' VP -> 'saw' NP VP -> 'walked' PP Det -> 'the' Det -> 'a' N -> 'man' N -> 'park' N -> 'dog' P -> 'in' P -> 'with' The first 10 generated sentences: >>> for sentence in generate(grammar, n=10): ... 
print(' '.join(sentence)) the man slept the man saw the man the man saw the park the man saw the dog the man saw a man the man saw a park the man saw a dog the man walked in the man the man walked in the park the man walked in the dog All sentences of max depth 4: >>> for sentence in generate(grammar, depth=4): ... print(' '.join(sentence)) the man slept the park slept the dog slept a man slept a park slept a dog slept The number of sentences of different max depths: >>> len(list(generate(grammar, depth=3))) 0 >>> len(list(generate(grammar, depth=4))) 6 >>> len(list(generate(grammar, depth=5))) 42 >>> len(list(generate(grammar, depth=6))) 114 >>> len(list(generate(grammar))) 114 nltk-3.1/nltk/test/gensim.doctest0000644000076500000240000001155312607224144016663 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ======================================= Demonstrate word embedding using Gensim ======================================= We demonstrate three functions: - Train the word embeddings using brown corpus; - Load the pre-trained model and perform simple tasks; and - Pruning the pre-trained binary model. >>> import gensim --------------- Train the model --------------- Here we train a word embedding using the Brown Corpus: >>> from nltk.corpus import brown >>> model = gensim.models.Word2Vec(brown.sents()) It might take some time to train the model. So, after it is trained, it can be saved as follows: >>> model.save('brown.embedding') >>> new_model = gensim.models.Word2Vec.load('brown.embedding') The model will be the list of words with their embedding. We can easily get the vector representation of a word. >>> len(new_model['university']) 100 There are some supporting functions already implemented in Gensim to manipulate with word embeddings. For example, to compute the cosine similarity between 2 words: >>> new_model.similarity('university','school') > 0.3 True --------------------------- Using the pre-trained model --------------------------- NLTK includes a pre-trained model which is part of a model that is trained on 100 billion words from the Google News Dataset. The full model is from https://code.google.com/p/word2vec/ (about 3 GB). >>> from nltk.data import find >>> word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt')) >>> model = gensim.models.Word2Vec.load_word2vec_format(word2vec_sample, binary=False) We pruned the model to only include the most common words (~44k words). >>> len(model.vocab) 43981 Each word is represented in the space of 300 dimensions: >>> len(model['university']) 300 Finding the top n words that are similar to a target word is simple. The result is the list of n words with the score. >>> model.most_similar(positive=['university'], topn = 3) [(u'universities', 0.70039...), (u'faculty', 0.67809...), (u'undergraduate', 0.65870...)] Finding a word that is not in a list is also supported, although, implementing this by yourself is simple. >>> model.doesnt_match('breakfast cereal dinner lunch'.split()) 'cereal' Mikolov et al. (2013) figured out that word embedding captures much of syntactic and semantic regularities. For example, the vector 'King - Man + Woman' is close to 'Queen' and 'Germany - Berlin + Paris' is close to 'France'. 
>>> model.most_similar(positive=['woman','king'], negative=['man'], topn = 1) [(u'queen', 0.71181...)] >>> model.most_similar(positive=['Paris','Germany'], negative=['Berlin'], topn = 1) [(u'France', 0.78840...)] We can visualize the word embeddings using t-SNE (http://lvdmaaten.github.io/tsne/). For this demonstration, we visualize the first 1000 words. | import numpy as np | labels = [] | count = 0 | max_count = 1000 | X = np.zeros(shape=(max_count,len(model['university']))) | | for term in model.vocab: | X[count] = model[term] | labels.append(term) | count+= 1 | if count >= max_count: break | | # It is recommended to use PCA first to reduce to ~50 dimensions | from sklearn.decomposition import PCA | pca = PCA(n_components=50) | X_50 = pca.fit_transform(X) | | # Using TSNE to further reduce to 2 dimensions | from sklearn.manifold import TSNE | model_tsne = TSNE(n_components=2, random_state=0) | Y = model_tsne.fit_transform(X_50) | | # Show the scatter plot | import matplotlib.pyplot as plt | plt.scatter(Y[:,0], Y[:,1], 20) | | # Add labels | for label, x, y in zip(labels, Y[:, 0], Y[:, 1]): | plt.annotate(label, xy = (x,y), xytext = (0, 0), textcoords = 'offset points', size = 10) | | plt.show() ------------------------------ Prune the trained binary model ------------------------------ Here is the supporting code to extract part of the binary model (GoogleNews-vectors-negative300.bin.gz) from https://code.google.com/p/word2vec/ We use this code to get the `word2vec_sample` model. | import gensim | from gensim.models.word2vec import Word2Vec | # Load the binary model | model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary = True); | | # Only output word that appear in the Brown corpus | from nltk.corpus import brown | words = set(brown.words()) | print (len(words)) | | # Output presented word to a temporary file | out_file = 'pruned.word2vec.txt' | f = open(out_file,'wb') | | word_presented = words.intersection(model.vocab.keys()) | f.write('{} {}\n'.format(len(word_presented),len(model['word']))) | | for word in word_presented: | f.write('{} {}\n'.format(word, ' '.join(str(value) for value in model[word]))) | | f.close() nltk-3.1/nltk/test/gensim_fixt.py0000644000076500000240000000035012607224144016671 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import def setup_module(module): from nose import SkipTest try: import gensim except ImportError: raise SkipTest("Gensim doctest requires gensim") nltk-3.1/nltk/test/gluesemantics.doctest0000644000076500000240000003012412607224144020237 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ============================================================================== Glue Semantics ============================================================================== .. 
include:: ../../../nltk_book/definitions.rst ====================== Linear logic ====================== >>> from nltk.sem import logic >>> from nltk.sem.glue import * >>> from nltk.sem.linearlogic import * >>> from nltk.sem.linearlogic import Expression >>> read_expr = Expression.fromstring Parser >>> print(read_expr(r'f')) f >>> print(read_expr(r'(g -o f)')) (g -o f) >>> print(read_expr(r'(g -o (h -o f))')) (g -o (h -o f)) >>> print(read_expr(r'((g -o G) -o G)')) ((g -o G) -o G) >>> print(read_expr(r'(g -o f)(g)')) (g -o f)(g) >>> print(read_expr(r'((g -o G) -o G)((g -o f))')) ((g -o G) -o G)((g -o f)) Simplify >>> print(read_expr(r'f').simplify()) f >>> print(read_expr(r'(g -o f)').simplify()) (g -o f) >>> print(read_expr(r'((g -o G) -o G)').simplify()) ((g -o G) -o G) >>> print(read_expr(r'(g -o f)(g)').simplify()) f >>> try: read_expr(r'(g -o f)(f)').simplify() ... except LinearLogicApplicationException as e: print(e) ... Cannot apply (g -o f) to f. Cannot unify g with f given {} >>> print(read_expr(r'(G -o f)(g)').simplify()) f >>> print(read_expr(r'((g -o G) -o G)((g -o f))').simplify()) f Test BindingDict >>> h = ConstantExpression('h') >>> g = ConstantExpression('g') >>> f = ConstantExpression('f') >>> H = VariableExpression('H') >>> G = VariableExpression('G') >>> F = VariableExpression('F') >>> d1 = BindingDict({H: h}) >>> d2 = BindingDict({F: f, G: F}) >>> d12 = d1 + d2 >>> all12 = ['%s: %s' % (v, d12[v]) for v in d12.d] >>> all12.sort() >>> print(all12) ['F: f', 'G: f', 'H: h'] >>> BindingDict([(F,f),(G,g),(H,h)]) == BindingDict({F:f, G:g, H:h}) True >>> d4 = BindingDict({F: f}) >>> try: d4[F] = g ... except VariableBindingException as e: print(e) Variable F already bound to another value Test Unify >>> try: f.unify(g, BindingDict()) ... except UnificationException as e: print(e) ... Cannot unify f with g given {} >>> f.unify(G, BindingDict()) == BindingDict({G: f}) True >>> try: f.unify(G, BindingDict({G: h})) ... except UnificationException as e: print(e) ... Cannot unify f with G given {G: h} >>> f.unify(G, BindingDict({G: f})) == BindingDict({G: f}) True >>> f.unify(G, BindingDict({H: f})) == BindingDict({G: f, H: f}) True >>> G.unify(f, BindingDict()) == BindingDict({G: f}) True >>> try: G.unify(f, BindingDict({G: h})) ... except UnificationException as e: print(e) ... Cannot unify G with f given {G: h} >>> G.unify(f, BindingDict({G: f})) == BindingDict({G: f}) True >>> G.unify(f, BindingDict({H: f})) == BindingDict({G: f, H: f}) True >>> G.unify(F, BindingDict()) == BindingDict({G: F}) True >>> try: G.unify(F, BindingDict({G: H})) ... except UnificationException as e: print(e) ... 
Cannot unify G with F given {G: H} >>> G.unify(F, BindingDict({G: F})) == BindingDict({G: F}) True >>> G.unify(F, BindingDict({H: F})) == BindingDict({G: F, H: F}) True Test Compile >>> print(read_expr('g').compile_pos(Counter(), GlueFormula)) (, []) >>> print(read_expr('(g -o f)').compile_pos(Counter(), GlueFormula)) (, []) >>> print(read_expr('(g -o (h -o f))').compile_pos(Counter(), GlueFormula)) (, []) ====================== Glue ====================== Demo of "John walks" -------------------- >>> john = GlueFormula("John", "g") >>> print(john) John : g >>> walks = GlueFormula(r"\x.walks(x)", "(g -o f)") >>> print(walks) \x.walks(x) : (g -o f) >>> print(walks.applyto(john)) \x.walks(x)(John) : (g -o f)(g) >>> print(walks.applyto(john).simplify()) walks(John) : f Demo of "A dog walks" --------------------- >>> a = GlueFormula("\P Q.some x.(P(x) and Q(x))", "((gv -o gr) -o ((g -o G) -o G))") >>> print(a) \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G) -o G)) >>> man = GlueFormula(r"\x.man(x)", "(gv -o gr)") >>> print(man) \x.man(x) : (gv -o gr) >>> walks = GlueFormula(r"\x.walks(x)", "(g -o f)") >>> print(walks) \x.walks(x) : (g -o f) >>> a_man = a.applyto(man) >>> print(a_man.simplify()) \Q.exists x.(man(x) & Q(x)) : ((g -o G) -o G) >>> a_man_walks = a_man.applyto(walks) >>> print(a_man_walks.simplify()) exists x.(man(x) & walks(x)) : f Demo of 'every girl chases a dog' --------------------------------- Individual words: >>> every = GlueFormula("\P Q.all x.(P(x) -> Q(x))", "((gv -o gr) -o ((g -o G) -o G))") >>> print(every) \P Q.all x.(P(x) -> Q(x)) : ((gv -o gr) -o ((g -o G) -o G)) >>> girl = GlueFormula(r"\x.girl(x)", "(gv -o gr)") >>> print(girl) \x.girl(x) : (gv -o gr) >>> chases = GlueFormula(r"\x y.chases(x,y)", "(g -o (h -o f))") >>> print(chases) \x y.chases(x,y) : (g -o (h -o f)) >>> a = GlueFormula("\P Q.some x.(P(x) and Q(x))", "((hv -o hr) -o ((h -o H) -o H))") >>> print(a) \P Q.exists x.(P(x) & Q(x)) : ((hv -o hr) -o ((h -o H) -o H)) >>> dog = GlueFormula(r"\x.dog(x)", "(hv -o hr)") >>> print(dog) \x.dog(x) : (hv -o hr) Noun Quantification can only be done one way: >>> every_girl = every.applyto(girl) >>> print(every_girl.simplify()) \Q.all x.(girl(x) -> Q(x)) : ((g -o G) -o G) >>> a_dog = a.applyto(dog) >>> print(a_dog.simplify()) \Q.exists x.(dog(x) & Q(x)) : ((h -o H) -o H) The first reading is achieved by combining 'chases' with 'a dog' first. Since 'a girl' requires something of the form '(h -o H)' we must get rid of the 'g' in the glue of 'see'. We will do this with the '-o elimination' rule. So, x1 will be our subject placeholder. >>> xPrime = GlueFormula("x1", "g") >>> print(xPrime) x1 : g >>> xPrime_chases = chases.applyto(xPrime) >>> print(xPrime_chases.simplify()) \y.chases(x1,y) : (h -o f) >>> xPrime_chases_a_dog = a_dog.applyto(xPrime_chases) >>> print(xPrime_chases_a_dog.simplify()) exists x.(dog(x) & chases(x1,x)) : f Now we can retract our subject placeholder using lambda-abstraction and combine with the true subject. >>> chases_a_dog = xPrime_chases_a_dog.lambda_abstract(xPrime) >>> print(chases_a_dog.simplify()) \x1.exists x.(dog(x) & chases(x1,x)) : (g -o f) >>> every_girl_chases_a_dog = every_girl.applyto(chases_a_dog) >>> r1 = every_girl_chases_a_dog.simplify() >>> r2 = GlueFormula(r'all x.(girl(x) -> exists z1.(dog(z1) & chases(x,z1)))', 'f') >>> r1 == r2 True The second reading is achieved by combining 'every girl' with 'chases' first. 
>>> xPrime = GlueFormula("x1", "g") >>> print(xPrime) x1 : g >>> xPrime_chases = chases.applyto(xPrime) >>> print(xPrime_chases.simplify()) \y.chases(x1,y) : (h -o f) >>> yPrime = GlueFormula("x2", "h") >>> print(yPrime) x2 : h >>> xPrime_chases_yPrime = xPrime_chases.applyto(yPrime) >>> print(xPrime_chases_yPrime.simplify()) chases(x1,x2) : f >>> chases_yPrime = xPrime_chases_yPrime.lambda_abstract(xPrime) >>> print(chases_yPrime.simplify()) \x1.chases(x1,x2) : (g -o f) >>> every_girl_chases_yPrime = every_girl.applyto(chases_yPrime) >>> print(every_girl_chases_yPrime.simplify()) all x.(girl(x) -> chases(x,x2)) : f >>> every_girl_chases = every_girl_chases_yPrime.lambda_abstract(yPrime) >>> print(every_girl_chases.simplify()) \x2.all x.(girl(x) -> chases(x,x2)) : (h -o f) >>> every_girl_chases_a_dog = a_dog.applyto(every_girl_chases) >>> r1 = every_girl_chases_a_dog.simplify() >>> r2 = GlueFormula(r'exists x.(dog(x) & all z2.(girl(z2) -> chases(z2,x)))', 'f') >>> r1 == r2 True Compilation ----------- >>> for cp in GlueFormula('m', '(b -o a)').compile(Counter()): print(cp) m : (b -o a) : {1} >>> for cp in GlueFormula('m', '((c -o b) -o a)').compile(Counter()): print(cp) v1 : c : {1} m : (b[1] -o a) : {2} >>> for cp in GlueFormula('m', '((d -o (c -o b)) -o a)').compile(Counter()): print(cp) v1 : c : {1} v2 : d : {2} m : (b[1, 2] -o a) : {3} >>> for cp in GlueFormula('m', '((d -o e) -o ((c -o b) -o a))').compile(Counter()): print(cp) v1 : d : {1} v2 : c : {2} m : (e[1] -o (b[2] -o a)) : {3} >>> for cp in GlueFormula('m', '(((d -o c) -o b) -o a)').compile(Counter()): print(cp) v1 : (d -o c) : {1} m : (b[1] -o a) : {2} >>> for cp in GlueFormula('m', '((((e -o d) -o c) -o b) -o a)').compile(Counter()): print(cp) v1 : e : {1} v2 : (d[1] -o c) : {2} m : (b[2] -o a) : {3} Demo of 'a man walks' using Compilation --------------------------------------- Premises >>> a = GlueFormula('\\P Q.some x.(P(x) and Q(x))', '((gv -o gr) -o ((g -o G) -o G))') >>> print(a) \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G) -o G)) >>> man = GlueFormula('\\x.man(x)', '(gv -o gr)') >>> print(man) \x.man(x) : (gv -o gr) >>> walks = GlueFormula('\\x.walks(x)', '(g -o f)') >>> print(walks) \x.walks(x) : (g -o f) Compiled Premises: >>> counter = Counter() >>> ahc = a.compile(counter) >>> g1 = ahc[0] >>> print(g1) v1 : gv : {1} >>> g2 = ahc[1] >>> print(g2) v2 : g : {2} >>> g3 = ahc[2] >>> print(g3) \P Q.exists x.(P(x) & Q(x)) : (gr[1] -o (G[2] -o G)) : {3} >>> g4 = man.compile(counter)[0] >>> print(g4) \x.man(x) : (gv -o gr) : {4} >>> g5 = walks.compile(counter)[0] >>> print(g5) \x.walks(x) : (g -o f) : {5} Derivation: >>> g14 = g4.applyto(g1) >>> print(g14.simplify()) man(v1) : gr : {1, 4} >>> g134 = g3.applyto(g14) >>> print(g134.simplify()) \Q.exists x.(man(x) & Q(x)) : (G[2] -o G) : {1, 3, 4} >>> g25 = g5.applyto(g2) >>> print(g25.simplify()) walks(v2) : f : {2, 5} >>> g12345 = g134.applyto(g25) >>> print(g12345.simplify()) exists x.(man(x) & walks(x)) : f : {1, 2, 3, 4, 5} --------------------------------- Dependency Graph to Glue Formulas --------------------------------- >>> from nltk.corpus.reader.dependency import DependencyGraph >>> depgraph = DependencyGraph("""1 John _ NNP NNP _ 2 SUBJ _ _ ... 2 sees _ VB VB _ 0 ROOT _ _ ... 3 a _ ex_quant ex_quant _ 4 SPEC _ _ ... 4 dog _ NN NN _ 2 OBJ _ _ ... """) >>> gfl = GlueDict('nltk:grammars/sample_grammars/glue.semtype').to_glueformula_list(depgraph) >>> for gf in gfl: ... 
print(gf) \x y.sees(x,y) : (f -o (i -o g)) \P Q.exists x.(P(x) & Q(x)) : ((fv -o fr) -o ((f -o F2) -o F2)) \x.John(x) : (fv -o fr) \x.dog(x) : (iv -o ir) \P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I5) -o I5)) >>> glue = Glue() >>> for r in sorted([r.simplify().normalize() for r in glue.get_readings(glue.gfl_to_compiled(gfl))], key=str): ... print(r) exists z1.(John(z1) & exists z2.(dog(z2) & sees(z1,z2))) exists z1.(dog(z1) & exists z2.(John(z2) & sees(z2,z1))) ----------------------------------- Dependency Graph to LFG f-structure ----------------------------------- >>> from nltk.sem.lfg import FStructure >>> fstruct = FStructure.read_depgraph(depgraph) >>> print(fstruct) f:[pred 'sees' obj h:[pred 'dog' spec 'a'] subj g:[pred 'John']] >>> fstruct.to_depgraph().tree().pprint() (sees (dog a) John) --------------------------------- LFG f-structure to Glue --------------------------------- >>> for gf in fstruct.to_glueformula_list(GlueDict('nltk:grammars/sample_grammars/glue.semtype')): # doctest: +SKIP ... print(gf) \x y.sees(x,y) : (i -o (g -o f)) \x.dog(x) : (gv -o gr) \P Q.exists x.(P(x) & Q(x)) : ((gv -o gr) -o ((g -o G3) -o G3)) \P Q.exists x.(P(x) & Q(x)) : ((iv -o ir) -o ((i -o I4) -o I4)) \x.John(x) : (iv -o ir) .. see gluesemantics_malt.doctest for more nltk-3.1/nltk/test/gluesemantics_malt.doctest0000644000076500000240000000471112607224144021257 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT .. see also: gluesemantics.doctest ============================================================================== Glue Semantics ============================================================================== >>> from nltk.sem.glue import * >>> nltk.sem.logic._counter._value = 0 -------------------------------- Initialize the Dependency Parser -------------------------------- >>> from nltk.parse.malt import MaltParser >>> tagger = RegexpTagger( ... [('^(John|Mary)$', 'NNP'), ... ('^(sees|chases)$', 'VB'), ... ('^(a)$', 'ex_quant'), ... ('^(every)$', 'univ_quant'), ... ('^(girl|dog)$', 'NN') ... ]) >>> depparser = MaltParser(tagger=tagger) -------------------- Automated Derivation -------------------- >>> glue = Glue(depparser=depparser) >>> readings = glue.parse_to_meaning('every girl chases a dog'.split()) >>> for reading in sorted([r.simplify().normalize() for r in readings], key=str): ... print(reading.normalize()) all z1.(girl(z1) -> exists z2.(dog(z2) & chases(z1,z2))) exists z1.(dog(z1) & all z2.(girl(z2) -> chases(z2,z1))) >>> drtglue = DrtGlue(depparser=depparser) >>> readings = drtglue.parse_to_meaning('every girl chases a dog'.split()) >>> for reading in sorted([r.simplify().normalize() for r in readings], key=str): ... print(reading) ([],[(([z1],[girl(z1)]) -> ([z2],[dog(z2), chases(z1,z2)]))]) ([z1],[dog(z1), (([z2],[girl(z2)]) -> ([],[chases(z2,z1)]))]) -------------- With inference -------------- Checking for equality of two DRSs is very useful when generating readings of a sentence. For example, the ``glue`` module generates two readings for the sentence *John sees Mary*: >>> from nltk.sem.glue import DrtGlue >>> readings = drtglue.parse_to_meaning('John sees Mary'.split()) >>> for drs in sorted([r.simplify().normalize() for r in readings], key=str): ... print(drs) ([z1,z2],[John(z1), Mary(z2), sees(z1,z2)]) ([z1,z2],[Mary(z1), John(z2), sees(z2,z1)]) However, it is easy to tell that these two readings are logically the same, and therefore one of them is superfluous. 
We can use the theorem prover to determine this equivalence, and then delete one of them. A particular theorem prover may be specified, or the argument may be left off to use the default. >>> readings[0].equiv(readings[1]) True nltk-3.1/nltk/test/gluesemantics_malt_fixt.py0000644000076500000240000000045612607224144021276 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import def setup_module(module): from nose import SkipTest from nltk.parse.malt import MaltParser try: depparser = MaltParser('maltparser-1.7.2') except LookupError: raise SkipTest("MaltParser is not available") nltk-3.1/nltk/test/grammar.doctest0000644000076500000240000000254612607224144017031 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT =============== Grammar Parsing =============== Grammars can be parsed from strings: >>> from nltk import CFG >>> grammar = CFG.fromstring(""" ... S -> NP VP ... PP -> P NP ... NP -> Det N | NP PP ... VP -> V NP | VP PP ... Det -> 'a' | 'the' ... N -> 'dog' | 'cat' ... V -> 'chased' | 'sat' ... P -> 'on' | 'in' ... """) >>> grammar >>> grammar.start() S >>> grammar.productions() # doctest: +NORMALIZE_WHITESPACE [S -> NP VP, PP -> P NP, NP -> Det N, NP -> NP PP, VP -> V NP, VP -> VP PP, Det -> 'a', Det -> 'the', N -> 'dog', N -> 'cat', V -> 'chased', V -> 'sat', P -> 'on', P -> 'in'] Probabilistic CFGs: >>> from nltk import PCFG >>> toy_pcfg1 = PCFG.fromstring(""" ... S -> NP VP [1.0] ... NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15] ... Det -> 'the' [0.8] | 'my' [0.2] ... N -> 'man' [0.5] | 'telescope' [0.5] ... VP -> VP PP [0.1] | V NP [0.7] | V [0.2] ... V -> 'ate' [0.35] | 'saw' [0.65] ... PP -> P NP [1.0] ... P -> 'with' [0.61] | 'under' [0.39] ... """) Chomsky Normal Form grammar (Test for bug 474) >>> g = CFG.fromstring("VP^ -> VBP NP^") >>> g.productions()[0].lhs() VP^ nltk-3.1/nltk/test/grammartestsuites.doctest0000644000076500000240000000620412607224144021161 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ========================== Test Suites for Grammars ========================== Sentences in the test suite are divided into two classes: - grammatical (*accept*) and - ungrammatical (*reject*). If a sentence should parse accordng to the grammar, the value of ``trees`` will be a non-empty list. If a sentence should be rejected according to the grammar, then the value of ``trees`` will be ``None``. >>> from nltk.parse import TestGrammar >>> germantest1 = {} >>> germantest1['doc'] = "Tests for person agreement" >>> germantest1['accept'] = [ ... 'ich komme', ... 'ich sehe mich', ... 'du kommst', ... 'du siehst mich', ... 'sie kommt', ... 'sie sieht mich', ... 'ihr kommt', ... 'wir kommen', ... 'sie kommen', ... 'du magst mich', ... 'er mag mich', ... 'du folgst mir', ... 'sie hilft mir', ... ] >>> germantest1['reject'] = [ ... 'ich kommt', ... 'ich kommst', ... 'ich siehst mich', ... 'du komme', ... 'du sehe mich', ... 'du kommt', ... 'er komme', ... 'er siehst mich', ... 'wir komme', ... 'wir kommst', ... 'die Katzen kommst', ... 'sie komme', ... 'sie kommst', ... 'du mag mich', ... 'er magst mich', ... 'du folgt mir', ... 'sie hilfst mir', ... ] >>> germantest2 = {} >>> germantest2['doc'] = "Tests for number agreement" >>> germantest2['accept'] = [ ... 'der Hund kommt', ... 'die Hunde kommen', ... 'ich komme', ... 'wir kommen', ... 'ich sehe die Katzen', ... 'ich folge den Katzen', ... 
'ich sehe die Katzen', ... 'ich folge den Katzen', ... 'wir sehen die Katzen', ... 'wir folgen den Katzen' ... ] >>> germantest2['reject'] = [ ... 'ich kommen', ... 'wir komme', ... 'der Hunde kommt', ... 'der Hunde kommen', ... 'die Katzen kommt', ... 'ich sehe der Hunde', ... 'ich folge den Hund', ... 'ich sehen der Hunde', ... 'ich folgen den Hund', ... 'wir sehe die Katzen', ... 'wir folge den Katzen' ... ] >>> germantest3 = {} >>> germantest3['doc'] = "Tests for case government and subcategorization" >>> germantest3['accept'] = [ ... 'der Hund sieht mich', ... 'der Hund kommt', ... 'ich sehe den Hund', ... 'ich helfe dem Hund', ... ] >>> germantest3['reject'] = [ ... 'ich sehe', ... 'ich helfe', ... 'ich komme den Hund', ... 'ich sehe den Hund die Katzen', ... 'du hilfst mich', ... 'du siehst mir', ... 'du siehst ich', ... 'der Hunde kommt mich', ... 'die Hunde sehe die Hunde', ... 'der Hund sehe die Hunde', ... 'ich hilft den Hund', ... 'ich hilft der Hund', ... 'ich sehe dem Hund', ... ] >>> germantestsuites = [germantest1, germantest2, germantest3] >>> tester = TestGrammar('grammars/book_grammars/german.fcfg', germantestsuites) >>> tester.run() Tests for person agreement: All tests passed! Tests for number agreement: All tests passed! Tests for case government and subcategorization: All tests passed! nltk-3.1/nltk/test/index.doctest0000644000076500000240000000505112607224144016504 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT .. _align howto: align.html .. _ccg howto: ccg.html .. _chat80 howto: chat80.html .. _childes howto: childes.html .. _chunk howto: chunk.html .. _classify howto: classify.html .. _collocations howto: collocations.html .. _compat howto: compat.html .. _corpus howto: corpus.html .. _data howto: data.html .. _dependency howto: dependency.html .. _discourse howto: discourse.html .. _drt howto: drt.html .. _featgram howto: featgram.html .. _featstruct howto: featstruct.html .. _framenet howto: framenet.html .. _generate howto: generate.html .. _gluesemantics howto: gluesemantics.html .. _gluesemantics_malt howto: gluesemantics_malt.html .. _grammar howto: grammar.html .. _grammartestsuites howto: grammartestsuites.html .. _index howto: index.html .. _inference howto: inference.html .. _internals howto: internals.html .. _japanese howto: japanese.html .. _logic howto: logic.html .. _metrics howto: metrics.html .. _misc howto: misc.html .. _nonmonotonic howto: nonmonotonic.html .. _parse howto: parse.html .. _portuguese_en howto: portuguese_en.html .. _probability howto: probability.html .. _propbank howto: propbank.html .. _relextract howto: relextract.html .. _resolution howto: resolution.html .. _semantics howto: semantics.html .. _simple howto: simple.html .. _stem howto: stem.html .. _tag howto: tag.html .. _tokenize howto: tokenize.html .. _toolbox howto: toolbox.html .. _tree howto: tree.html .. _treetransforms howto: treetransforms.html .. _util howto: util.html .. _wordnet howto: wordnet.html .. 
_wordnet_lch howto: wordnet_lch.html =========== NLTK HOWTOs =========== * `align HOWTO`_ * `ccg HOWTO`_ * `chat80 HOWTO`_ * `childes HOWTO`_ * `chunk HOWTO`_ * `classify HOWTO`_ * `collocations HOWTO`_ * `compat HOWTO`_ * `corpus HOWTO`_ * `data HOWTO`_ * `dependency HOWTO`_ * `discourse HOWTO`_ * `drt HOWTO`_ * `featgram HOWTO`_ * `featstruct HOWTO`_ * `framenet HOWTO`_ * `generate HOWTO`_ * `gluesemantics HOWTO`_ * `gluesemantics_malt HOWTO`_ * `grammar HOWTO`_ * `grammartestsuites HOWTO`_ * `index HOWTO`_ * `inference HOWTO`_ * `internals HOWTO`_ * `japanese HOWTO`_ * `logic HOWTO`_ * `metrics HOWTO`_ * `misc HOWTO`_ * `nonmonotonic HOWTO`_ * `parse HOWTO`_ * `portuguese_en HOWTO`_ * `probability HOWTO`_ * `propbank HOWTO`_ * `relextract HOWTO`_ * `resolution HOWTO`_ * `semantics HOWTO`_ * `simple HOWTO`_ * `stem HOWTO`_ * `tag HOWTO`_ * `tokenize HOWTO`_ * `toolbox HOWTO`_ * `tree HOWTO`_ * `treetransforms HOWTO`_ * `util HOWTO`_ * `wordnet HOWTO`_ * `wordnet_lch HOWTO`_ nltk-3.1/nltk/test/inference.doctest0000644000076500000240000004270512607224144017342 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ==================================== Logical Inference and Model Building ==================================== >>> from nltk import * >>> from nltk.sem.drt import DrtParser >>> from nltk.sem import logic >>> logic._counter._value = 0 ------------ Introduction ------------ Within the area of automated reasoning, first order theorem proving and model building (or model generation) have both received much attention, and have given rise to highly sophisticated techniques. We focus therefore on providing an NLTK interface to third party tools for these tasks. In particular, the module ``nltk.inference`` can be used to access both theorem provers and model builders. --------------------------------- NLTK Interface to Theorem Provers --------------------------------- The main class used to interface with a theorem prover is the ``Prover`` class, found in ``nltk.api``. The ``prove()`` method takes three optional arguments: a goal, a list of assumptions, and a ``verbose`` boolean to indicate whether the proof should be printed to the console. The proof goal and any assumptions need to be instances of the ``Expression`` class specified by ``nltk.sem.logic``. There are currently three theorem provers included with NLTK: ``Prover9``, ``TableauProver``, and ``ResolutionProver``. The first is an off-the-shelf prover, while the other two are written in Python and included in the ``nltk.inference`` package. >>> from nltk.sem import Expression >>> read_expr = Expression.fromstring >>> p1 = read_expr('man(socrates)') >>> p2 = read_expr('all x.(man(x) -> mortal(x))') >>> c = read_expr('mortal(socrates)') >>> Prover9().prove(c, [p1,p2]) True >>> TableauProver().prove(c, [p1,p2]) True >>> ResolutionProver().prove(c, [p1,p2], verbose=True) [1] {-mortal(socrates)} A [2] {man(socrates)} A [3] {-man(z2), mortal(z2)} A [4] {-man(socrates)} (1, 3) [5] {mortal(socrates)} (2, 3) [6] {} (1, 5) True --------------------- The ``ProverCommand`` --------------------- A ``ProverCommand`` is a stateful holder for a theorem prover. The command stores a theorem prover instance (of type ``Prover``), a goal, a list of assumptions, the result of the proof, and a string version of the entire proof. 
Corresponding to the three included ``Prover`` implementations, there are three ``ProverCommand`` implementations: ``Prover9Command``, ``TableauProverCommand``, and ``ResolutionProverCommand``. The ``ProverCommand``'s constructor takes its goal and assumptions. The ``prove()`` command executes the ``Prover`` and ``proof()`` returns a String form of the proof If the ``prove()`` method has not been called, then the prover command will be unable to display a proof. >>> prover = ResolutionProverCommand(c, [p1,p2]) >>> print(prover.proof()) # doctest: +ELLIPSIS Traceback (most recent call last): File "...", line 1212, in __run compileflags, 1) in test.globs File "", line 1, in File "...", line ..., in proof raise LookupError("You have to call prove() first to get a proof!") LookupError: You have to call prove() first to get a proof! >>> prover.prove() True >>> print(prover.proof()) [1] {-mortal(socrates)} A [2] {man(socrates)} A [3] {-man(z4), mortal(z4)} A [4] {-man(socrates)} (1, 3) [5] {mortal(socrates)} (2, 3) [6] {} (1, 5) The prover command stores the result of proving so that if ``prove()`` is called again, then the command can return the result without executing the prover again. This allows the user to access the result of the proof without wasting time re-computing what it already knows. >>> prover.prove() True >>> prover.prove() True The assumptions and goal may be accessed using the ``assumptions()`` and ``goal()`` methods, respectively. >>> prover.assumptions() [, mortal(x))>] >>> prover.goal() The assumptions list may be modified using the ``add_assumptions()`` and ``retract_assumptions()`` methods. Both methods take a list of ``Expression`` objects. Since adding or removing assumptions may change the result of the proof, the stored result is cleared when either of these methods are called. That means that ``proof()`` will be unavailable until ``prove()`` is called and a call to ``prove()`` will execute the theorem prover. >>> prover.retract_assumptions([read_expr('man(socrates)')]) >>> print(prover.proof()) # doctest: +ELLIPSIS Traceback (most recent call last): File "...", line 1212, in __run compileflags, 1) in test.globs File "", line 1, in File "...", line ..., in proof raise LookupError("You have to call prove() first to get a proof!") LookupError: You have to call prove() first to get a proof! >>> prover.prove() False >>> print(prover.proof()) [1] {-mortal(socrates)} A [2] {-man(z6), mortal(z6)} A [3] {-man(socrates)} (1, 2) >>> prover.add_assumptions([read_expr('man(socrates)')]) >>> prover.prove() True ------- Prover9 ------- Prover9 Installation ~~~~~~~~~~~~~~~~~~~~ You can download Prover9 from http://www.cs.unm.edu/~mccune/prover9/. Extract the source code into a suitable directory and follow the instructions in the Prover9 ``README.make`` file to compile the executables. Install these into an appropriate location; the ``prover9_search`` variable is currently configured to look in the following locations: >>> p = Prover9() >>> p.binary_locations() # doctest: +NORMALIZE_WHITESPACE ['/usr/local/bin/prover9', '/usr/local/bin/prover9/bin', '/usr/local/bin', '/usr/bin', '/usr/local/prover9', '/usr/local/share/prover9'] Alternatively, the environment variable ``PROVER9HOME`` may be configured with the binary's location. 
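For example (a sketch only, using the same placeholder path as the warning message above; point it at wherever the Prover9 binaries were actually installed, before NLTK first searches for them):

    >>> import os
    >>> os.environ['PROVER9HOME'] = '/path/to/prover9'  # doctest: +SKIP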
The path to the correct directory can be set manually in the following manner: >>> config_prover9(path='/usr/local/bin') # doctest: +SKIP [Found prover9: /usr/local/bin/prover9] If the executables cannot be found, ``Prover9`` will issue a warning message: >>> p.prove() # doctest: +SKIP Traceback (most recent call last): ... LookupError: =========================================================================== NLTK was unable to find the prover9 executable! Use config_prover9() or set the PROVER9HOME environment variable. >> config_prover9('/path/to/prover9') For more information, on prover9, see: =========================================================================== Using Prover9 ~~~~~~~~~~~~~ The general case in theorem proving is to determine whether ``S |- g`` holds, where ``S`` is a possibly empty set of assumptions, and ``g`` is a proof goal. As mentioned earlier, NLTK input to ``Prover9`` must be ``Expression``\ s of ``nltk.sem.logic``. A ``Prover9`` instance is initialized with a proof goal and, possibly, some assumptions. The ``prove()`` method attempts to find a proof of the goal, given the list of assumptions (in this case, none). >>> goal = read_expr('(man(x) <-> --man(x))') >>> prover = Prover9Command(goal) >>> prover.prove() True Given a ``ProverCommand`` instance ``prover``, the method ``prover.proof()`` will return a String of the extensive proof information provided by Prover9, shown in abbreviated form here:: ============================== Prover9 =============================== Prover9 (32) version ... Process ... was started by ... on ... ... The command was ".../prover9 -f ...". ============================== end of head =========================== ============================== INPUT ================================= % Reading from file /var/... formulas(goals). (all x (man(x) -> man(x))). end_of_list. ... ============================== end of search ========================= THEOREM PROVED Exiting with 1 proof. Process 6317 exit (max_proofs) Mon Jan 21 15:23:28 2008 As mentioned earlier, we may want to list some assumptions for the proof, as shown here. >>> g = read_expr('mortal(socrates)') >>> a1 = read_expr('all x.(man(x) -> mortal(x))') >>> prover = Prover9Command(g, assumptions=[a1]) >>> prover.print_assumptions() all x.(man(x) -> mortal(x)) However, the assumptions are not sufficient to derive the goal: >>> print(prover.prove()) False So let's add another assumption: >>> a2 = read_expr('man(socrates)') >>> prover.add_assumptions([a2]) >>> prover.print_assumptions() all x.(man(x) -> mortal(x)) man(socrates) >>> print(prover.prove()) True We can also show the assumptions in ``Prover9`` format. >>> prover.print_assumptions(output_format='Prover9') all x (man(x) -> mortal(x)) man(socrates) >>> prover.print_assumptions(output_format='Spass') Traceback (most recent call last): . . . NameError: Unrecognized value for 'output_format': Spass Assumptions can be retracted from the list of assumptions. >>> prover.retract_assumptions([a1]) >>> prover.print_assumptions() man(socrates) >>> prover.retract_assumptions([a1]) Statements can be loaded from a file and parsed. We can then add these statements as new assumptions. >>> g = read_expr('all x.(boxer(x) -> -boxerdog(x))') >>> prover = Prover9Command(g) >>> prover.prove() False >>> import nltk.data >>> new = nltk.data.load('grammars/sample_grammars/background0.fol') >>> for a in new: ... 
print(a) all x.(boxerdog(x) -> dog(x)) all x.(boxer(x) -> person(x)) all x.-(dog(x) & person(x)) exists x.boxer(x) exists x.boxerdog(x) >>> prover.add_assumptions(new) >>> print(prover.prove()) True >>> print(prover.proof()) # doctest: +ELLIPSIS ============================== prooftrans ============================ Prover9 (...) version ... Process ... was started by ... on ... ... The command was ".../prover9". ============================== end of head =========================== ============================== end of input ========================== ============================== PROOF ================================= % -------- Comments from original proof -------- % Proof 1 at ... seconds. % Length of proof is 13. % Level of proof is 4. % Maximum clause weight is 0.000. % Given clauses 0. 1 (all x (boxerdog(x) -> dog(x))). [assumption]. 2 (all x (boxer(x) -> person(x))). [assumption]. 3 (all x -(dog(x) & person(x))). [assumption]. 6 (all x (boxer(x) -> -boxerdog(x))). [goal]. 8 -boxerdog(x) | dog(x). [clausify(1)]. 9 boxerdog(c3). [deny(6)]. 11 -boxer(x) | person(x). [clausify(2)]. 12 boxer(c3). [deny(6)]. 14 -dog(x) | -person(x). [clausify(3)]. 15 dog(c3). [resolve(9,a,8,a)]. 18 person(c3). [resolve(12,a,11,a)]. 19 -person(c3). [resolve(15,a,14,a)]. 20 $F. [resolve(19,a,18,a)]. ============================== end of proof ========================== ---------------------- The equiv() method ---------------------- One application of the theorem prover functionality is to check if two Expressions have the same meaning. The ``equiv()`` method calls a theorem prover to determine whether two Expressions are logically equivalent. >>> a = read_expr(r'exists x.(man(x) & walks(x))') >>> b = read_expr(r'exists x.(walks(x) & man(x))') >>> print(a.equiv(b)) True The same method can be used on Discourse Representation Structures (DRSs). In this case, each DRS is converted to a first order logic form, and then passed to the theorem prover. >>> dp = DrtParser() >>> a = dp.parse(r'([x],[man(x), walks(x)])') >>> b = dp.parse(r'([x],[walks(x), man(x)])') >>> print(a.equiv(b)) True -------------------------------- NLTK Interface to Model Builders -------------------------------- The top-level to model builders is parallel to that for theorem-provers. The ``ModelBuilder`` interface is located in ``nltk.inference.api``. It is currently only implemented by ``Mace``, which interfaces with the Mace4 model builder. Typically we use a model builder to show that some set of formulas has a model, and is therefore consistent. One way of doing this is by treating our candidate set of sentences as assumptions, and leaving the goal unspecified. Thus, the following interaction shows how both ``{a, c1}`` and ``{a, c2}`` are consistent sets, since Mace succeeds in a building a model for each of them, while ``{c1, c2}`` is inconsistent. >>> a3 = read_expr('exists x.(man(x) and walks(x))') >>> c1 = read_expr('mortal(socrates)') >>> c2 = read_expr('-mortal(socrates)') >>> mace = Mace() >>> print(mace.build_model(None, [a3, c1])) True >>> print(mace.build_model(None, [a3, c2])) True We can also use the model builder as an adjunct to theorem prover. Let's suppose we are trying to prove ``S |- g``, i.e. that ``g`` is logically entailed by assumptions ``S = {s1, s2, ..., sn}``. We can this same input to Mace4, and the model builder will try to find a counterexample, that is, to show that ``g`` does *not* follow from ``S``. So, given this input, Mace4 will try to find a model for the set ``S' = {s1, s2, ..., sn, (not g)}``. 
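As a small illustrative sketch (the formulas here are hypothetical and the call is skipped in this suite), passing ``build_model()`` a goal that the assumptions do not entail asks Mace to search for exactly such a countermodel of ``S'``:

    >>> s1 = read_expr('all x.(man(x) -> mortal(x))')
    >>> s2 = read_expr('man(socrates)')
    >>> g0 = read_expr('immortal(socrates)')
    >>> mace.build_model(g0, [s1, s2])  # doctest: +SKIP
    True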
If ``g`` fails to follow from ``S``, then Mace4 may well return with a counterexample faster than Prover9 concludes that it cannot find the required proof. Conversely, if ``g`` *is* provable from ``S``, Mace4 may take a long time unsuccessfully trying to find a counter model, and will eventually give up. In the following example, we see that the model builder does succeed in building a model of the assumptions together with the negation of the goal. That is, it succeeds in finding a model where there is a woman that every man loves; Adam is a man; Eve is a woman; but Adam does not love Eve. >>> a4 = read_expr('exists y. (woman(y) & all x. (man(x) -> love(x,y)))') >>> a5 = read_expr('man(adam)') >>> a6 = read_expr('woman(eve)') >>> g = read_expr('love(adam,eve)') >>> print(mace.build_model(g, [a4, a5, a6])) True The Model Builder will fail to find a model if the assumptions do entail the goal. Mace will continue to look for models of ever-increasing sizes until the end_size number is reached. By default, end_size is 500, but it can be set manually for quicker response time. >>> a7 = read_expr('all x.(man(x) -> mortal(x))') >>> a8 = read_expr('man(socrates)') >>> g2 = read_expr('mortal(socrates)') >>> print(Mace(end_size=50).build_model(g2, [a7, a8])) False There is also a ``ModelBuilderCommand`` class that, like ``ProverCommand``, stores a ``ModelBuilder``, a goal, assumptions, a result, and a model. The only implementation in NLTK is ``MaceCommand``. ----- Mace4 ----- Mace4 Installation ~~~~~~~~~~~~~~~~~~ Mace4 is packaged with Prover9, and can be downloaded from the same source, namely http://www.cs.unm.edu/~mccune/prover9/. It is installed in the same manner as Prover9. Using Mace4 ~~~~~~~~~~~ Check whether Mace4 can find a model. >>> a = read_expr('(see(mary,john) & -(mary = john))') >>> mb = MaceCommand(assumptions=[a]) >>> mb.build_model() True Show the model in 'tabular' format. >>> print(mb.model(format='tabular')) % number = 1 % seconds = 0 % Interpretation of size 2 john : 0 mary : 1 see : | 0 1 ---+---- 0 | 0 0 1 | 1 0 Show the model in 'tabular' format. >>> print(mb.model(format='cooked')) % number = 1 % seconds = 0 % Interpretation of size 2 john = 0. mary = 1. - see(0,0). - see(0,1). see(1,0). - see(1,1). The property ``valuation`` accesses the stored ``Valuation``. >>> print(mb.valuation) {'john': 'a', 'mary': 'b', 'see': {('b', 'a')}} We can return to our earlier example and inspect the model: >>> mb = MaceCommand(g, assumptions=[a4, a5, a6]) >>> m = mb.build_model() >>> print(mb.model(format='cooked')) % number = 1 % seconds = 0 % Interpretation of size 2 adam = 0. eve = 0. c1 = 1. man(0). - man(1). woman(0). woman(1). - love(0,0). love(0,1). - love(1,0). - love(1,1). Here, we can see that ``adam`` and ``eve`` have been assigned the same individual, namely ``0`` as value; ``0`` is both a man and a woman; a second individual ``1`` is also a woman; and ``0`` loves ``1``. Thus, this is an interpretation in which there is a woman that every man loves but Adam doesn't love Eve. Mace can also be used with propositional logic. 
>>> p = read_expr('P') >>> q = read_expr('Q') >>> mb = MaceCommand(q, [p, p>-q]) >>> mb.build_model() True >>> mb.valuation['P'] True >>> mb.valuation['Q'] False nltk-3.1/nltk/test/inference_fixt.py0000644000076500000240000000051712574600335017355 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import def setup_module(module): from nose import SkipTest from nltk.inference.mace import Mace try: m = Mace() m._find_binary('mace4') except LookupError: raise SkipTest("Mace4/Prover9 is not available so inference.doctest was skipped") nltk-3.1/nltk/test/internals.doctest0000644000076500000240000000711312607224144017375 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ========================================== Unit tests for the nltk.utilities module ========================================== overridden() ~~~~~~~~~~~~ >>> from nltk.internals import overridden The typical use case is in defining methods for an interface or abstract base class, in such a way that subclasses don't have to implement all of the methods: >>> class EaterI(object): ... '''Subclass must define eat() or batch_eat().''' ... def eat(self, food): ... if overridden(self.batch_eat): ... return self.batch_eat([food])[0] ... else: ... raise NotImplementedError() ... def batch_eat(self, foods): ... return [self.eat(food) for food in foods] As long as a subclass implements one method, it will be used to perform the other method: >>> class GoodEater1(EaterI): ... def eat(self, food): ... return 'yum' >>> GoodEater1().eat('steak') 'yum' >>> GoodEater1().batch_eat(['steak', 'peas']) ['yum', 'yum'] >>> class GoodEater2(EaterI): ... def batch_eat(self, foods): ... return ['yum' for food in foods] >>> GoodEater2().eat('steak') 'yum' >>> GoodEater2().batch_eat(['steak', 'peas']) ['yum', 'yum'] But if a subclass doesn't implement either one, then they'll get an error when they try to call them. (nb this is better than infinite recursion): >>> class BadEater1(EaterI): ... pass >>> BadEater1().eat('steak') Traceback (most recent call last): . . . NotImplementedError >>> BadEater1().batch_eat(['steak', 'peas']) Traceback (most recent call last): . . . NotImplementedError Trying to use the abstract base class itself will also result in an error: >>> class EaterI(EaterI): ... pass >>> EaterI().eat('steak') Traceback (most recent call last): . . . NotImplementedError >>> EaterI().batch_eat(['steak', 'peas']) Traceback (most recent call last): . . . NotImplementedError It's ok to use intermediate abstract classes: >>> class AbstractEater(EaterI): ... pass >>> class GoodEater3(AbstractEater): ... def eat(self, food): ... return 'yum' ... >>> GoodEater3().eat('steak') 'yum' >>> GoodEater3().batch_eat(['steak', 'peas']) ['yum', 'yum'] >>> class GoodEater4(AbstractEater): ... def batch_eat(self, foods): ... return ['yum' for food in foods] >>> GoodEater4().eat('steak') 'yum' >>> GoodEater4().batch_eat(['steak', 'peas']) ['yum', 'yum'] >>> class BadEater2(AbstractEater): ... pass >>> BadEater2().eat('steak') Traceback (most recent call last): . . . NotImplementedError >>> BadEater2().batch_eat(['steak', 'peas']) Traceback (most recent call last): . . . NotImplementedError Here's some extra tests: >>> class A(object): ... def f(x): pass >>> class B(A): ... 
def f(x): pass >>> class C(A): pass >>> class D(B): pass >>> overridden(A().f) False >>> overridden(B().f) True >>> overridden(C().f) False >>> overridden(D().f) True It works for classic classes, too: >>> class A: ... def f(x): pass >>> class B(A): ... def f(x): pass >>> class C(A): pass >>> class D(B): pass >>> overridden(A().f) False >>> overridden(B().f) True >>> overridden(C().f) False >>> overridden(D().f) True nltk-3.1/nltk/test/japanese.doctest0000644000076500000240000000202512607224144017161 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ============================ Japanese Language Processing ============================ >>> from nltk import * ------------- Corpus Access ------------- KNB Corpus ---------- >>> from nltk.corpus import knbc Access the words: this should produce a list of strings: >>> type(knbc.words()[0]) is not bytes True Access the sentences: this should produce a list of lists of strings: >>> type(knbc.sents()[0][0]) is not bytes True Access the tagged words: this should produce a list of word, tag pairs: >>> type(knbc.tagged_words()[0]) <... 'tuple'> Access the tagged sentences: this should produce a list of lists of word, tag pairs: >>> type(knbc.tagged_sents()[0][0]) <... 'tuple'> JEITA Corpus ------------ >>> from nltk.corpus import jeita Access the tagged words: this should produce a list of word, tag pairs, where a tag is a string: >>> type(jeita.tagged_words()[0][1]) is not bytes True nltk-3.1/nltk/test/logic.doctest0000644000076500000240000010276512607224144016504 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ======================= Logic & Lambda Calculus ======================= The `nltk.logic` package allows expressions of First-Order Logic (FOL) to be parsed into ``Expression`` objects. In addition to FOL, the parser handles lambda-abstraction with variables of higher order. 
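As a quick preview (this example reappears in the Simplify tests below), an expression built from higher-order lambda abstractions can be parsed with ``Expression.fromstring`` and beta-reduced with ``simplify()``:

    >>> from nltk.sem.logic import Expression
    >>> e = Expression.fromstring(r'((\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x))')
    >>> print(e.simplify())
    exists x.(dog(x) & bark(x))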
-------- Overview -------- >>> from nltk.sem.logic import * The default inventory of logical constants is the following: >>> boolean_ops() # doctest: +NORMALIZE_WHITESPACE negation - conjunction & disjunction | implication -> equivalence <-> >>> equality_preds() # doctest: +NORMALIZE_WHITESPACE equality = inequality != >>> binding_ops() # doctest: +NORMALIZE_WHITESPACE existential exists universal all lambda \ ---------------- Regression Tests ---------------- Untyped Logic +++++++++++++ Process logical expressions conveniently: >>> read_expr = Expression.fromstring Test for equality under alpha-conversion ======================================== >>> e1 = read_expr('exists x.P(x)') >>> print(e1) exists x.P(x) >>> e2 = e1.alpha_convert(Variable('z')) >>> print(e2) exists z.P(z) >>> e1 == e2 True >>> l = read_expr(r'\X.\X.X(X)(1)').simplify() >>> id = read_expr(r'\X.X(X)') >>> l == id True Test numerals ============= >>> zero = read_expr(r'\F x.x') >>> one = read_expr(r'\F x.F(x)') >>> two = read_expr(r'\F x.F(F(x))') >>> three = read_expr(r'\F x.F(F(F(x)))') >>> four = read_expr(r'\F x.F(F(F(F(x))))') >>> succ = read_expr(r'\N F x.F(N(F,x))') >>> plus = read_expr(r'\M N F x.M(F,N(F,x))') >>> mult = read_expr(r'\M N F.M(N(F))') >>> pred = read_expr(r'\N F x.(N(\G H.H(G(F)))(\u.x)(\u.u))') >>> v1 = ApplicationExpression(succ, zero).simplify() >>> v1 == one True >>> v2 = ApplicationExpression(succ, v1).simplify() >>> v2 == two True >>> v3 = ApplicationExpression(ApplicationExpression(plus, v1), v2).simplify() >>> v3 == three True >>> v4 = ApplicationExpression(ApplicationExpression(mult, v2), v2).simplify() >>> v4 == four True >>> v5 = ApplicationExpression(pred, ApplicationExpression(pred, v4)).simplify() >>> v5 == two True Overloaded operators also exist, for convenience. >>> print(succ(zero).simplify() == one) True >>> print(plus(one,two).simplify() == three) True >>> print(mult(two,two).simplify() == four) True >>> print(pred(pred(four)).simplify() == two) True >>> john = read_expr(r'john') >>> man = read_expr(r'\x.man(x)') >>> walk = read_expr(r'\x.walk(x)') >>> man(john).simplify() >>> print(-walk(john).simplify()) -walk(john) >>> print((man(john) & walk(john)).simplify()) (man(john) & walk(john)) >>> print((man(john) | walk(john)).simplify()) (man(john) | walk(john)) >>> print((man(john) > walk(john)).simplify()) (man(john) -> walk(john)) >>> print((man(john) < walk(john)).simplify()) (man(john) <-> walk(john)) Python's built-in lambda operator can also be used with Expressions >>> john = VariableExpression(Variable('john')) >>> run_var = VariableExpression(Variable('run')) >>> run = lambda x: run_var(x) >>> run(john) ``betaConversionTestSuite.pl`` ------------------------------ Tests based on Blackburn & Bos' book, *Representation and Inference for Natural Language*. 
>>> x1 = read_expr(r'\P.P(mia)(\x.walk(x))').simplify() >>> x2 = read_expr(r'walk(mia)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'exists x.(man(x) & ((\P.exists x.(woman(x) & P(x)))(\y.love(x,y))))').simplify() >>> x2 = read_expr(r'exists x.(man(x) & exists y.(woman(y) & love(x,y)))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\a.sleep(a)(mia)').simplify() >>> x2 = read_expr(r'sleep(mia)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\a.\b.like(b,a)(mia)').simplify() >>> x2 = read_expr(r'\b.like(b,mia)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\a.(\b.like(b,a)(vincent))').simplify() >>> x2 = read_expr(r'\a.like(vincent,a)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\a.((\b.like(b,a)(vincent)) & sleep(a))').simplify() >>> x2 = read_expr(r'\a.(like(vincent,a) & sleep(a))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'(\a.\b.like(b,a)(mia)(vincent))').simplify() >>> x2 = read_expr(r'like(vincent,mia)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'P((\a.sleep(a)(vincent)))').simplify() >>> x2 = read_expr(r'P(sleep(vincent))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\A.A((\b.sleep(b)(vincent)))').simplify() >>> x2 = read_expr(r'\A.A(sleep(vincent))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\A.A(sleep(vincent))').simplify() >>> x2 = read_expr(r'\A.A(sleep(vincent))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'(\A.A(vincent)(\b.sleep(b)))').simplify() >>> x2 = read_expr(r'sleep(vincent)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\A.believe(mia,A(vincent))(\b.sleep(b))').simplify() >>> x2 = read_expr(r'believe(mia,sleep(vincent))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'(\A.(A(vincent) & A(mia)))(\b.sleep(b))').simplify() >>> x2 = read_expr(r'(sleep(vincent) & sleep(mia))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\A.\B.(\C.C(A(vincent))(\d.probably(d)) & (\C.C(B(mia))(\d.improbably(d))))(\f.walk(f))(\f.talk(f))').simplify() >>> x2 = read_expr(r'(probably(walk(vincent)) & improbably(talk(mia)))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'(\a.\b.(\C.C(a,b)(\d.\f.love(d,f))))(jules)(mia)').simplify() >>> x2 = read_expr(r'love(jules,mia)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'(\A.\B.exists c.(A(c) & B(c)))(\d.boxer(d),\d.sleep(d))').simplify() >>> x2 = read_expr(r'exists c.(boxer(c) & sleep(c))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\A.Z(A)(\c.\a.like(a,c))').simplify() >>> x2 = read_expr(r'Z(\c.\a.like(a,c))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'\A.\b.A(b)(\c.\b.like(b,c))').simplify() >>> x2 = read_expr(r'\b.(\c.\b.like(b,c)(b))').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'(\a.\b.(\C.C(a,b)(\b.\a.loves(b,a))))(jules)(mia)').simplify() >>> x2 = read_expr(r'loves(jules,mia)').simplify() >>> x1 == x2 True >>> x1 = read_expr(r'(\A.\b.(exists b.A(b) & A(b)))(\c.boxer(c))(vincent)').simplify() >>> x2 = read_expr(r'((exists b.boxer(b)) & boxer(vincent))').simplify() >>> x1 == x2 True Test Parser =========== >>> print(read_expr(r'john')) john >>> print(read_expr(r'x')) x >>> print(read_expr(r'-man(x)')) -man(x) >>> print(read_expr(r'--man(x)')) --man(x) >>> print(read_expr(r'(man(x))')) man(x) >>> print(read_expr(r'((man(x)))')) man(x) >>> print(read_expr(r'man(x) <-> tall(x)')) (man(x) <-> tall(x)) >>> print(read_expr(r'(man(x) <-> tall(x))')) (man(x) <-> tall(x)) >>> print(read_expr(r'(man(x) & tall(x) & walks(x))')) (man(x) & tall(x) & walks(x)) >>> print(read_expr(r'(man(x) & tall(x) & walks(x))').first) (man(x) & tall(x)) >>> 
print(read_expr(r'man(x) | tall(x) & walks(x)')) (man(x) | (tall(x) & walks(x))) >>> print(read_expr(r'((man(x) & tall(x)) | walks(x))')) ((man(x) & tall(x)) | walks(x)) >>> print(read_expr(r'man(x) & (tall(x) | walks(x))')) (man(x) & (tall(x) | walks(x))) >>> print(read_expr(r'(man(x) & (tall(x) | walks(x)))')) (man(x) & (tall(x) | walks(x))) >>> print(read_expr(r'P(x) -> Q(x) <-> R(x) | S(x) & T(x)')) ((P(x) -> Q(x)) <-> (R(x) | (S(x) & T(x)))) >>> print(read_expr(r'exists x.man(x)')) exists x.man(x) >>> print(read_expr(r'exists x.(man(x) & tall(x))')) exists x.(man(x) & tall(x)) >>> print(read_expr(r'exists x.(man(x) & tall(x) & walks(x))')) exists x.(man(x) & tall(x) & walks(x)) >>> print(read_expr(r'-P(x) & Q(x)')) (-P(x) & Q(x)) >>> read_expr(r'-P(x) & Q(x)') == read_expr(r'(-P(x)) & Q(x)') True >>> print(read_expr(r'\x.man(x)')) \x.man(x) >>> print(read_expr(r'\x.man(x)(john)')) \x.man(x)(john) >>> print(read_expr(r'\x.man(x)(john) & tall(x)')) (\x.man(x)(john) & tall(x)) >>> print(read_expr(r'\x.\y.sees(x,y)')) \x y.sees(x,y) >>> print(read_expr(r'\x y.sees(x,y)')) \x y.sees(x,y) >>> print(read_expr(r'\x.\y.sees(x,y)(a)')) (\x y.sees(x,y))(a) >>> print(read_expr(r'\x y.sees(x,y)(a)')) (\x y.sees(x,y))(a) >>> print(read_expr(r'\x.\y.sees(x,y)(a)(b)')) ((\x y.sees(x,y))(a))(b) >>> print(read_expr(r'\x y.sees(x,y)(a)(b)')) ((\x y.sees(x,y))(a))(b) >>> print(read_expr(r'\x.\y.sees(x,y)(a,b)')) ((\x y.sees(x,y))(a))(b) >>> print(read_expr(r'\x y.sees(x,y)(a,b)')) ((\x y.sees(x,y))(a))(b) >>> print(read_expr(r'((\x.\y.sees(x,y))(a))(b)')) ((\x y.sees(x,y))(a))(b) >>> print(read_expr(r'P(x)(y)(z)')) P(x,y,z) >>> print(read_expr(r'P(Q)')) P(Q) >>> print(read_expr(r'P(Q(x))')) P(Q(x)) >>> print(read_expr(r'(\x.exists y.walks(x,y))(x)')) (\x.exists y.walks(x,y))(x) >>> print(read_expr(r'exists x.(x = john)')) exists x.(x = john) >>> print(read_expr(r'((\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x))')) ((\P Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x)) >>> a = read_expr(r'exists c.exists b.A(b,c) & A(b,c)') >>> b = read_expr(r'(exists c.(exists b.A(b,c))) & A(b,c)') >>> print(a == b) True >>> a = read_expr(r'exists c.(exists b.A(b,c) & A(b,c))') >>> b = read_expr(r'exists c.((exists b.A(b,c)) & A(b,c))') >>> print(a == b) True >>> print(read_expr(r'exists x.x = y')) exists x.(x = y) >>> print(read_expr('A(B)(C)')) A(B,C) >>> print(read_expr('(A(B))(C)')) A(B,C) >>> print(read_expr('A((B)(C))')) A(B(C)) >>> print(read_expr('A(B(C))')) A(B(C)) >>> print(read_expr('(A)(B(C))')) A(B(C)) >>> print(read_expr('(((A)))(((B))(((C))))')) A(B(C)) >>> print(read_expr(r'A != B')) -(A = B) >>> print(read_expr('P(x) & x=y & P(y)')) (P(x) & (x = y) & P(y)) >>> try: print(read_expr(r'\walk.walk(x)')) ... except LogicalExpressionException as e: print(e) 'walk' is an illegal variable name. Constants may not be abstracted. \walk.walk(x) ^ >>> try: print(read_expr(r'all walk.walk(john)')) ... except LogicalExpressionException as e: print(e) 'walk' is an illegal variable name. Constants may not be quantified. all walk.walk(john) ^ >>> try: print(read_expr(r'x(john)')) ... except LogicalExpressionException as e: print(e) 'x' is an illegal predicate name. Individual variables may not be used as predicates. 
x(john) ^ >>> from nltk.sem.logic import LogicParser # hack to give access to custom quote chars >>> lpq = LogicParser() >>> lpq.quote_chars = [("'", "'", "\\", False)] >>> print(lpq.parse(r"(man(x) & 'tall\'s,' (x) & walks (x) )")) (man(x) & tall's,(x) & walks(x)) >>> lpq.quote_chars = [("'", "'", "\\", True)] >>> print(lpq.parse(r"'tall\'s,'")) 'tall\'s,' >>> print(lpq.parse(r"'spaced name(x)'")) 'spaced name(x)' >>> print(lpq.parse(r"-'tall\'s,'(x)")) -'tall\'s,'(x) >>> print(lpq.parse(r"(man(x) & 'tall\'s,' (x) & walks (x) )")) (man(x) & 'tall\'s,'(x) & walks(x)) Simplify ======== >>> print(read_expr(r'\x.man(x)(john)').simplify()) man(john) >>> print(read_expr(r'\x.((man(x)))(john)').simplify()) man(john) >>> print(read_expr(r'\x.\y.sees(x,y)(john, mary)').simplify()) sees(john,mary) >>> print(read_expr(r'\x y.sees(x,y)(john, mary)').simplify()) sees(john,mary) >>> print(read_expr(r'\x.\y.sees(x,y)(john)(mary)').simplify()) sees(john,mary) >>> print(read_expr(r'\x y.sees(x,y)(john)(mary)').simplify()) sees(john,mary) >>> print(read_expr(r'\x.\y.sees(x,y)(john)').simplify()) \y.sees(john,y) >>> print(read_expr(r'\x y.sees(x,y)(john)').simplify()) \y.sees(john,y) >>> print(read_expr(r'(\x.\y.sees(x,y)(john))(mary)').simplify()) sees(john,mary) >>> print(read_expr(r'(\x y.sees(x,y)(john))(mary)').simplify()) sees(john,mary) >>> print(read_expr(r'exists x.(man(x) & (\x.exists y.walks(x,y))(x))').simplify()) exists x.(man(x) & exists y.walks(x,y)) >>> e1 = read_expr(r'exists x.(man(x) & (\x.exists y.walks(x,y))(y))').simplify() >>> e2 = read_expr(r'exists x.(man(x) & exists z1.walks(y,z1))') >>> e1 == e2 True >>> print(read_expr(r'(\P Q.exists x.(P(x) & Q(x)))(\x.dog(x))').simplify()) \Q.exists x.(dog(x) & Q(x)) >>> print(read_expr(r'((\P.\Q.exists x.(P(x) & Q(x)))(\x.dog(x)))(\x.bark(x))').simplify()) exists x.(dog(x) & bark(x)) >>> print(read_expr(r'\P.(P(x)(y))(\a b.Q(a,b))').simplify()) Q(x,y) Replace ======= >>> a = read_expr(r'a') >>> x = read_expr(r'x') >>> y = read_expr(r'y') >>> z = read_expr(r'z') >>> print(read_expr(r'man(x)').replace(x.variable, a, False)) man(a) >>> print(read_expr(r'(man(x) & tall(x))').replace(x.variable, a, False)) (man(a) & tall(a)) >>> print(read_expr(r'exists x.man(x)').replace(x.variable, a, False)) exists x.man(x) >>> print(read_expr(r'exists x.man(x)').replace(x.variable, a, True)) exists a.man(a) >>> print(read_expr(r'exists x.give(x,y,z)').replace(y.variable, a, False)) exists x.give(x,a,z) >>> print(read_expr(r'exists x.give(x,y,z)').replace(y.variable, a, True)) exists x.give(x,a,z) >>> e1 = read_expr(r'exists x.give(x,y,z)').replace(y.variable, x, False) >>> e2 = read_expr(r'exists z1.give(z1,x,z)') >>> e1 == e2 True >>> e1 = read_expr(r'exists x.give(x,y,z)').replace(y.variable, x, True) >>> e2 = read_expr(r'exists z1.give(z1,x,z)') >>> e1 == e2 True >>> print(read_expr(r'\x y z.give(x,y,z)').replace(y.variable, a, False)) \x y z.give(x,y,z) >>> print(read_expr(r'\x y z.give(x,y,z)').replace(y.variable, a, True)) \x a z.give(x,a,z) >>> print(read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, a, False)) \x y.give(x,y,a) >>> print(read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, a, True)) \x y.give(x,y,a) >>> e1 = read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, x, False) >>> e2 = read_expr(r'\z1.\y.give(z1,y,x)') >>> e1 == e2 True >>> e1 = read_expr(r'\x.\y.give(x,y,z)').replace(z.variable, x, True) >>> e2 = read_expr(r'\z1.\y.give(z1,y,x)') >>> e1 == e2 True >>> print(read_expr(r'\x.give(x,y,z)').replace(z.variable, y, False)) 
\x.give(x,y,y) >>> print(read_expr(r'\x.give(x,y,z)').replace(z.variable, y, True)) \x.give(x,y,y) >>> from nltk.sem import logic >>> logic._counter._value = 0 >>> e1 = read_expr('e1') >>> e2 = read_expr('e2') >>> print(read_expr('exists e1 e2.(walk(e1) & talk(e2))').replace(e1.variable, e2, True)) exists e2 e01.(walk(e2) & talk(e01)) Variables / Free ================ >>> examples = [r'walk(john)', ... r'walk(x)', ... r'?vp(?np)', ... r'see(john,mary)', ... r'exists x.walk(x)', ... r'\x.see(john,x)', ... r'\x.see(john,x)(mary)', ... r'P(x)', ... r'\P.P(x)', ... r'aa(x,bb(y),cc(z),P(w),u)', ... r'bo(?det(?n),@x)'] >>> examples = [read_expr(e) for e in examples] >>> for e in examples: ... print('%-25s' % e, sorted(e.free())) walk(john) [] walk(x) [Variable('x')] ?vp(?np) [] see(john,mary) [] exists x.walk(x) [] \x.see(john,x) [] (\x.see(john,x))(mary) [] P(x) [Variable('P'), Variable('x')] \P.P(x) [Variable('x')] aa(x,bb(y),cc(z),P(w),u) [Variable('P'), Variable('u'), Variable('w'), Variable('x'), Variable('y'), Variable('z')] bo(?det(?n),@x) [] >>> for e in examples: ... print('%-25s' % e, sorted(e.constants())) walk(john) [Variable('john')] walk(x) [] ?vp(?np) [Variable('?np')] see(john,mary) [Variable('john'), Variable('mary')] exists x.walk(x) [] \x.see(john,x) [Variable('john')] (\x.see(john,x))(mary) [Variable('john'), Variable('mary')] P(x) [] \P.P(x) [] aa(x,bb(y),cc(z),P(w),u) [] bo(?det(?n),@x) [Variable('?n'), Variable('@x')] >>> for e in examples: ... print('%-25s' % e, sorted(e.predicates())) walk(john) [Variable('walk')] walk(x) [Variable('walk')] ?vp(?np) [Variable('?vp')] see(john,mary) [Variable('see')] exists x.walk(x) [Variable('walk')] \x.see(john,x) [Variable('see')] (\x.see(john,x))(mary) [Variable('see')] P(x) [] \P.P(x) [] aa(x,bb(y),cc(z),P(w),u) [Variable('aa'), Variable('bb'), Variable('cc')] bo(?det(?n),@x) [Variable('?det'), Variable('bo')] >>> for e in examples: ... print('%-25s' % e, sorted(e.variables())) walk(john) [] walk(x) [Variable('x')] ?vp(?np) [Variable('?np'), Variable('?vp')] see(john,mary) [] exists x.walk(x) [] \x.see(john,x) [] (\x.see(john,x))(mary) [] P(x) [Variable('P'), Variable('x')] \P.P(x) [Variable('x')] aa(x,bb(y),cc(z),P(w),u) [Variable('P'), Variable('u'), Variable('w'), Variable('x'), Variable('y'), Variable('z')] bo(?det(?n),@x) [Variable('?det'), Variable('?n'), Variable('@x')] `normalize` >>> print(read_expr(r'\e083.(walk(e083, z472) & talk(e092, z938))').normalize()) \e01.(walk(e01,z3) & talk(e02,z4)) Typed Logic +++++++++++ >>> from nltk.sem.logic import LogicParser >>> tlp = LogicParser(True) >>> print(tlp.parse(r'man(x)').type) ? >>> print(tlp.parse(r'walk(angus)').type) ? >>> print(tlp.parse(r'-man(x)').type) t >>> print(tlp.parse(r'(man(x) <-> tall(x))').type) t >>> print(tlp.parse(r'exists x.(man(x) & tall(x))').type) t >>> print(tlp.parse(r'\x.man(x)').type) >>> print(tlp.parse(r'john').type) e >>> print(tlp.parse(r'\x y.sees(x,y)').type) > >>> print(tlp.parse(r'\x.man(x)(john)').type) ? >>> print(tlp.parse(r'\x.\y.sees(x,y)(john)').type) >>> print(tlp.parse(r'\x.\y.sees(x,y)(john)(mary)').type) ? >>> print(tlp.parse(r'\P.\Q.exists x.(P(x) & Q(x))').type) <,<,t>> >>> print(tlp.parse(r'\x.y').type) >>> print(tlp.parse(r'\P.P(x)').type) <,?> >>> parsed = tlp.parse('see(john,mary)') >>> print(parsed.type) ? 
>>> print(parsed.function) see(john) >>> print(parsed.function.type) >>> print(parsed.function.function) see >>> print(parsed.function.function.type) > >>> parsed = tlp.parse('P(x,y)') >>> print(parsed) P(x,y) >>> print(parsed.type) ? >>> print(parsed.function) P(x) >>> print(parsed.function.type) >>> print(parsed.function.function) P >>> print(parsed.function.function.type) > >>> print(tlp.parse(r'P').type) ? >>> print(tlp.parse(r'P', {'P': 't'}).type) t >>> a = tlp.parse(r'P(x)') >>> print(a.type) ? >>> print(a.function.type) >>> print(a.argument.type) e >>> a = tlp.parse(r'-P(x)') >>> print(a.type) t >>> print(a.term.type) t >>> print(a.term.function.type) >>> print(a.term.argument.type) e >>> a = tlp.parse(r'P & Q') >>> print(a.type) t >>> print(a.first.type) t >>> print(a.second.type) t >>> a = tlp.parse(r'(P(x) & Q(x))') >>> print(a.type) t >>> print(a.first.type) t >>> print(a.first.function.type) >>> print(a.first.argument.type) e >>> print(a.second.type) t >>> print(a.second.function.type) >>> print(a.second.argument.type) e >>> a = tlp.parse(r'\x.P(x)') >>> print(a.type) >>> print(a.term.function.type) >>> print(a.term.argument.type) e >>> a = tlp.parse(r'\P.P(x)') >>> print(a.type) <,?> >>> print(a.term.function.type) >>> print(a.term.argument.type) e >>> a = tlp.parse(r'(\x.P(x)(john)) & Q(x)') >>> print(a.type) t >>> print(a.first.type) t >>> print(a.first.function.type) >>> print(a.first.function.term.function.type) >>> print(a.first.function.term.argument.type) e >>> print(a.first.argument.type) e >>> a = tlp.parse(r'\x y.P(x,y)(john)(mary) & Q(x)') >>> print(a.type) t >>> print(a.first.type) t >>> print(a.first.function.type) >>> print(a.first.function.function.type) > >>> a = tlp.parse(r'--P') >>> print(a.type) t >>> print(a.term.type) t >>> print(a.term.term.type) t >>> tlp.parse(r'\x y.P(x,y)').type > >>> tlp.parse(r'\x y.P(x,y)', {'P': '>'}).type > >>> a = tlp.parse(r'\P y.P(john,y)(\x y.see(x,y))') >>> a.type >>> a.function.type <>,> >>> a.function.term.term.function.function.type > >>> a.argument.type > >>> a = tlp.parse(r'exists c f.(father(c) = f)') >>> a.type t >>> a.term.term.type t >>> a.term.term.first.type e >>> a.term.term.first.function.type >>> a.term.term.second.type e typecheck() >>> a = tlp.parse('P(x)') >>> b = tlp.parse('Q(x)') >>> a.type ? >>> c = a & b >>> c.first.type ? >>> c.typecheck() # doctest: +ELLIPSIS {...} >>> c.first.type t >>> a = tlp.parse('P(x)') >>> b = tlp.parse('P(x) & Q(x)') >>> a.type ? >>> typecheck([a,b]) # doctest: +ELLIPSIS {...} >>> a.type t >>> e = tlp.parse(r'man(x)') >>> print(dict((k,str(v)) for k,v in e.typecheck().items()) == {'x': 'e', 'man': ''}) True >>> sig = {'man': ''} >>> e = tlp.parse(r'man(x)', sig) >>> print(e.function.type) >>> print(dict((k,str(v)) for k,v in e.typecheck().items()) == {'x': 'e', 'man': ''}) True >>> print(e.function.type) >>> print(dict((k,str(v)) for k,v in e.typecheck(sig).items()) == {'x': 'e', 'man': ''}) True findtype() >>> print(tlp.parse(r'man(x)').findtype(Variable('man'))) >>> print(tlp.parse(r'see(x,y)').findtype(Variable('see'))) > >>> print(tlp.parse(r'P(Q(R(x)))').findtype(Variable('Q'))) ? 
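To recap how these pieces fit together (an illustrative sketch only: the ``love`` predicate and its signature are invented here, and the example is skipped by the doctest runner), a signature can be supplied at parse time and the resulting expression queried with ``findtype()`` just as above:

    >>> sig = {'love': '<e,<e,t>>'}               # doctest: +SKIP
    >>> e = tlp.parse(r'love(john,mary)', sig)    # doctest: +SKIP
    >>> print(e.findtype(Variable('love')))       # doctest: +SKIP
    <e,<e,t>>
    >>> print(e.argument.type)                    # doctest: +SKIP
    e
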
reading types from strings >>> Type.fromstring('e') e >>> Type.fromstring('') >>> Type.fromstring('<,>') <,> >>> Type.fromstring('<,?>') <,?> alternative type format >>> Type.fromstring('e').str() 'IND' >>> Type.fromstring('').str() '(IND -> ANY)' >>> Type.fromstring('<,t>').str() '((IND -> BOOL) -> BOOL)' Type.__eq__() >>> from nltk.sem.logic import * >>> e = ENTITY_TYPE >>> t = TRUTH_TYPE >>> a = ANY_TYPE >>> et = ComplexType(e,t) >>> eet = ComplexType(e,ComplexType(e,t)) >>> at = ComplexType(a,t) >>> ea = ComplexType(e,a) >>> aa = ComplexType(a,a) >>> e == e True >>> t == t True >>> e == t False >>> a == t False >>> t == a False >>> a == a True >>> et == et True >>> a == et False >>> et == a False >>> a == ComplexType(a,aa) True >>> ComplexType(a,aa) == a True matches() >>> e.matches(t) False >>> a.matches(t) True >>> t.matches(a) True >>> a.matches(et) True >>> et.matches(a) True >>> ea.matches(eet) True >>> eet.matches(ea) True >>> aa.matches(et) True >>> aa.matches(t) True Type error during parsing ========================= >>> try: print(tlp.parse(r'exists x y.(P(x) & P(x,y))')) ... except InconsistentTypeHierarchyException as e: print(e) The variable 'P' was found in multiple places with different types. >>> try: tlp.parse(r'\x y.see(x,y)(\x.man(x))') ... except TypeException as e: print(e) The function '\x y.see(x,y)' is of type '>' and cannot be applied to '\x.man(x)' of type ''. Its argument must match type 'e'. >>> try: tlp.parse(r'\P x y.-P(x,y)(\x.-man(x))') ... except TypeException as e: print(e) The function '\P x y.-P(x,y)' is of type '<>,>>' and cannot be applied to '\x.-man(x)' of type ''. Its argument must match type '>'. >>> a = tlp.parse(r'-talk(x)') >>> signature = a.typecheck() >>> try: print(tlp.parse(r'-talk(x,y)', signature)) ... except InconsistentTypeHierarchyException as e: print(e) The variable 'talk' was found in multiple places with different types. >>> a = tlp.parse(r'-P(x)') >>> b = tlp.parse(r'-P(x,y)') >>> a.typecheck() # doctest: +ELLIPSIS {...} >>> b.typecheck() # doctest: +ELLIPSIS {...} >>> try: typecheck([a,b]) ... except InconsistentTypeHierarchyException as e: print(e) The variable 'P' was found in multiple places with different types. >>> a = tlp.parse(r'P(x)') >>> b = tlp.parse(r'P(x,y)') >>> signature = {'P': ''} >>> a.typecheck(signature) # doctest: +ELLIPSIS {...} >>> try: typecheck([a,b], signature) ... except InconsistentTypeHierarchyException as e: print(e) The variable 'P' was found in multiple places with different types. Parse errors ============ >>> try: read_expr(r'') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. ^ >>> try: read_expr(r'(') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. ( ^ >>> try: read_expr(r')') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. ) ^ >>> try: read_expr(r'()') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. () ^ >>> try: read_expr(r'(P(x) & Q(x)') ... except LogicalExpressionException as e: print(e) End of input found. Expected token ')'. (P(x) & Q(x) ^ >>> try: read_expr(r'(P(x) &') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. (P(x) & ^ >>> try: read_expr(r'(P(x) | )') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. (P(x) | ) ^ >>> try: read_expr(r'P(x) ->') ... 
except LogicalExpressionException as e: print(e) End of input found. Expression expected. P(x) -> ^ >>> try: read_expr(r'P(x') ... except LogicalExpressionException as e: print(e) End of input found. Expected token ')'. P(x ^ >>> try: read_expr(r'P(x,') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. P(x, ^ >>> try: read_expr(r'P(x,)') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. P(x,) ^ >>> try: read_expr(r'exists') ... except LogicalExpressionException as e: print(e) End of input found. Variable and Expression expected following quantifier 'exists'. exists ^ >>> try: read_expr(r'exists x') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. exists x ^ >>> try: read_expr(r'exists x.') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. exists x. ^ >>> try: read_expr(r'\ ') ... except LogicalExpressionException as e: print(e) End of input found. Variable and Expression expected following lambda operator. \ ^ >>> try: read_expr(r'\ x') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. \ x ^ >>> try: read_expr(r'\ x y') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. \ x y ^ >>> try: read_expr(r'\ x.') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. \ x. ^ >>> try: read_expr(r'P(x)Q(x)') ... except LogicalExpressionException as e: print(e) Unexpected token: 'Q'. P(x)Q(x) ^ >>> try: read_expr(r'(P(x)Q(x)') ... except LogicalExpressionException as e: print(e) Unexpected token: 'Q'. Expected token ')'. (P(x)Q(x) ^ >>> try: read_expr(r'exists x y') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. exists x y ^ >>> try: read_expr(r'exists x y.') ... except LogicalExpressionException as e: print(e) End of input found. Expression expected. exists x y. ^ >>> try: read_expr(r'exists x -> y') ... except LogicalExpressionException as e: print(e) Unexpected token: '->'. Expression expected. exists x -> y ^ >>> try: read_expr(r'A -> ((P(x) & Q(x)) -> Z') ... except LogicalExpressionException as e: print(e) End of input found. Expected token ')'. A -> ((P(x) & Q(x)) -> Z ^ >>> try: read_expr(r'A -> ((P(x) &) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> ((P(x) &) -> Z ^ >>> try: read_expr(r'A -> ((P(x) | )) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> ((P(x) | )) -> Z ^ >>> try: read_expr(r'A -> (P(x) ->) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (P(x) ->) -> Z ^ >>> try: read_expr(r'A -> (P(x) -> Z') ... except LogicalExpressionException as e: print(e) End of input found. Expected token ')'. A -> (P(x) -> Z ^ >>> try: read_expr(r'A -> (P(x,) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (P(x,) -> Z ^ >>> try: read_expr(r'A -> (P(x,)) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (P(x,)) -> Z ^ >>> try: read_expr(r'A -> (exists) -> Z') ... except LogicalExpressionException as e: print(e) ')' is an illegal variable name. Constants may not be quantified. A -> (exists) -> Z ^ >>> try: read_expr(r'A -> (exists x) -> Z') ... 
except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (exists x) -> Z ^ >>> try: read_expr(r'A -> (exists x.) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (exists x.) -> Z ^ >>> try: read_expr(r'A -> (\ ) -> Z') ... except LogicalExpressionException as e: print(e) ')' is an illegal variable name. Constants may not be abstracted. A -> (\ ) -> Z ^ >>> try: read_expr(r'A -> (\ x) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (\ x) -> Z ^ >>> try: read_expr(r'A -> (\ x y) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (\ x y) -> Z ^ >>> try: read_expr(r'A -> (\ x.) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (\ x.) -> Z ^ >>> try: read_expr(r'A -> (P(x)Q(x)) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: 'Q'. Expected token ')'. A -> (P(x)Q(x)) -> Z ^ >>> try: read_expr(r'A -> ((P(x)Q(x)) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: 'Q'. Expected token ')'. A -> ((P(x)Q(x)) -> Z ^ >>> try: read_expr(r'A -> (all x y) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (all x y) -> Z ^ >>> try: read_expr(r'A -> (exists x y.) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: ')'. Expression expected. A -> (exists x y.) -> Z ^ >>> try: read_expr(r'A -> (exists x -> y) -> Z') ... except LogicalExpressionException as e: print(e) Unexpected token: '->'. Expression expected. A -> (exists x -> y) -> Z ^ nltk-3.1/nltk/test/metrics.doctest0000644000076500000240000002224112607224144017043 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ======= Metrics ======= The `nltk.metrics` package provides a variety of *evaluation measures* which can be used for a wide variety of NLP tasks. >>> from __future__ import print_function >>> from nltk.metrics import * ------------------ Standard IR Scores ------------------ We can use standard scores from information retrieval to test the performance of taggers, chunkers, etc. >>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split() >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split() >>> print(accuracy(reference, test)) 0.8 The following measures apply to sets: >>> reference_set = set(reference) >>> test_set = set(test) >>> precision(reference_set, test_set) 1.0 >>> print(recall(reference_set, test_set)) 0.8 >>> print(f_measure(reference_set, test_set)) 0.88888888888... Measuring the likelihood of the data, given probability distributions: >>> from nltk import FreqDist, MLEProbDist >>> pdist1 = MLEProbDist(FreqDist("aldjfalskfjaldsf")) >>> pdist2 = MLEProbDist(FreqDist("aldjfalssjjlldss")) >>> print(log_likelihood(['a', 'd'], [pdist1, pdist2])) -2.7075187496... ---------------- Distance Metrics ---------------- String edit distance (Levenshtein): >>> edit_distance("rain", "shine") 3 Other distance measures: >>> s1 = set([1,2,3,4]) >>> s2 = set([3,4,5]) >>> binary_distance(s1, s2) 1.0 >>> print(jaccard_distance(s1, s2)) 0.6 >>> print(masi_distance(s1, s2)) 0.868... ---------------------- Miscellaneous Measures ---------------------- Rank Correlation works with two dictionaries mapping keys to ranks. The dictionaries should have the same set of keys. 
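For instance (an illustrative sketch; the letter rankings are invented and the example is skipped by the doctest runner), such dictionaries can be built from ordered lists with ``enumerate`` and then passed to ``spearman_correlation``, as in the literal example that follows:

    >>> ranking1 = ['e', 't', 'a']                                            # doctest: +SKIP
    >>> ranking2 = ['e', 'a', 't']                                            # doctest: +SKIP
    >>> ranks1 = dict((key, rank) for rank, key in enumerate(ranking1, 1))    # doctest: +SKIP
    >>> ranks2 = dict((key, rank) for rank, key in enumerate(ranking2, 1))    # doctest: +SKIP
    >>> spearman_correlation(ranks1, ranks2)                                  # doctest: +SKIP
    0.5
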
>>> spearman_correlation({'e':1, 't':2, 'a':3}, {'e':1, 'a':2, 't':3}) 0.5 Windowdiff uses a sliding window in comparing two segmentations of the same input (e.g. tokenizations, chunkings). Segmentations are represented using strings of zeros and ones. >>> s1 = "000100000010" >>> s2 = "000010000100" >>> s3 = "100000010000" >>> s4 = "000000000000" >>> s5 = "111111111111" >>> windowdiff(s1, s1, 3) 0.0 >>> abs(windowdiff(s1, s2, 3) - 0.3) < 1e-6 # windowdiff(s1, s2, 3) == 0.3 True >>> abs(windowdiff(s2, s3, 3) - 0.8) < 1e-6 # windowdiff(s2, s3, 3) == 0.8 True >>> windowdiff(s1, s4, 3) 0.5 >>> windowdiff(s1, s5, 3) 1.0 ---------------- Confusion Matrix ---------------- >>> reference = 'This is the reference data. Testing 123. aoaeoeoe' >>> test = 'Thos iz_the rifirenci data. Testeng 123. aoaeoeoe' >>> print(ConfusionMatrix(reference, test)) | . 1 2 3 T _ a c d e f g h i n o r s t z | --+-------------------------------------------+ |<8>. . . . . 1 . . . . . . . . . . . . . . | . | .<2>. . . . . . . . . . . . . . . . . . . | 1 | . .<1>. . . . . . . . . . . . . . . . . . | 2 | . . .<1>. . . . . . . . . . . . . . . . . | 3 | . . . .<1>. . . . . . . . . . . . . . . . | T | . . . . .<2>. . . . . . . . . . . . . . . | _ | . . . . . .<.>. . . . . . . . . . . . . . | a | . . . . . . .<4>. . . . . . . . . . . . . | c | . . . . . . . .<1>. . . . . . . . . . . . | d | . . . . . . . . .<1>. . . . . . . . . . . | e | . . . . . . . . . .<6>. . . 3 . . . . . . | f | . . . . . . . . . . .<1>. . . . . . . . . | g | . . . . . . . . . . . .<1>. . . . . . . . | h | . . . . . . . . . . . . .<2>. . . . . . . | i | . . . . . . . . . . 1 . . .<1>. 1 . . . . | n | . . . . . . . . . . . . . . .<2>. . . . . | o | . . . . . . . . . . . . . . . .<3>. . . . | r | . . . . . . . . . . . . . . . . .<2>. . . | s | . . . . . . . . . . . . . . . . . .<2>. 1 | t | . . . . . . . . . . . . . . . . . . .<3>. | z | . . . . . . . . . . . . . . . . . . . .<.>| --+-------------------------------------------+ (row = reference; col = test) >>> cm = ConfusionMatrix(reference, test) >>> print(cm.pretty_format(sort_by_count=True)) | e a i o s t . T h n r 1 2 3 c d f g _ z | --+-------------------------------------------+ |<8>. . . . . . . . . . . . . . . . . . 1 . | e | .<6>. 3 . . . . . . . . . . . . . . . . . | a | . .<4>. . . . . . . . . . . . . . . . . . | i | . 1 .<1>1 . . . . . . . . . . . . . . . . | o | . . . .<3>. . . . . . . . . . . . . . . . | s | . . . . .<2>. . . . . . . . . . . . . . 1 | t | . . . . . .<3>. . . . . . . . . . . . . . | . | . . . . . . .<2>. . . . . . . . . . . . . | T | . . . . . . . .<2>. . . . . . . . . . . . | h | . . . . . . . . .<2>. . . . . . . . . . . | n | . . . . . . . . . .<2>. . . . . . . . . . | r | . . . . . . . . . . .<2>. . . . . . . . . | 1 | . . . . . . . . . . . .<1>. . . . . . . . | 2 | . . . . . . . . . . . . .<1>. . . . . . . | 3 | . . . . . . . . . . . . . .<1>. . . . . . | c | . . . . . . . . . . . . . . .<1>. . . . . | d | . . . . . . . . . . . . . . . .<1>. . . . | f | . . . . . . . . . . . . . . . . .<1>. . . | g | . . . . . . . . . . . . . . . . . .<1>. . | _ | . . . . . . . . . . . . . . . . . . .<.>. | z | . . . . . . . . . . . . . . . . . . . .<.>| --+-------------------------------------------+ (row = reference; col = test) >>> print(cm.pretty_format(sort_by_count=True, truncate=10)) | e a i o s t . T h | --+---------------------+ |<8>. . . . . . . . . | e | .<6>. 3 . . . . . . | a | . .<4>. . . . . . . | i | . 1 .<1>1 . . . . . | o | . . . .<3>. . . . . | s | . . . . .<2>. . . . 
| t | . . . . . .<3>. . . | . | . . . . . . .<2>. . | T | . . . . . . . .<2>. | h | . . . . . . . . .<2>| --+---------------------+ (row = reference; col = test) >>> print(cm.pretty_format(sort_by_count=True, truncate=10, values_in_chart=False)) | 1 | | 1 2 3 4 5 6 7 8 9 0 | ---+---------------------+ 1 |<8>. . . . . . . . . | 2 | .<6>. 3 . . . . . . | 3 | . .<4>. . . . . . . | 4 | . 1 .<1>1 . . . . . | 5 | . . . .<3>. . . . . | 6 | . . . . .<2>. . . . | 7 | . . . . . .<3>. . . | 8 | . . . . . . .<2>. . | 9 | . . . . . . . .<2>. | 10 | . . . . . . . . .<2>| ---+---------------------+ (row = reference; col = test) Value key: 1: 2: e 3: a 4: i 5: o 6: s 7: t 8: . 9: T 10: h -------------------- Association measures -------------------- These measures are useful to determine whether the coocurrence of two random events is meaningful. They are used, for instance, to distinguish collocations from other pairs of adjacent words. We bring some examples of bigram association calculations from Manning and Schutze's SNLP, 2nd Ed. chapter 5. >>> n_new_companies, n_new, n_companies, N = 8, 15828, 4675, 14307668 >>> bam = BigramAssocMeasures >>> bam.raw_freq(20, (42, 20), N) == 20. / N True >>> bam.student_t(n_new_companies, (n_new, n_companies), N) 0.999... >>> bam.chi_sq(n_new_companies, (n_new, n_companies), N) 1.54... >>> bam.likelihood_ratio(150, (12593, 932), N) 1291... For other associations, we ensure the ordering of the measures: >>> bam.mi_like(20, (42, 20), N) > bam.mi_like(20, (41, 27), N) True >>> bam.pmi(20, (42, 20), N) > bam.pmi(20, (41, 27), N) True >>> bam.phi_sq(20, (42, 20), N) > bam.phi_sq(20, (41, 27), N) True >>> bam.poisson_stirling(20, (42, 20), N) > bam.poisson_stirling(20, (41, 27), N) True >>> bam.jaccard(20, (42, 20), N) > bam.jaccard(20, (41, 27), N) True >>> bam.dice(20, (42, 20), N) > bam.dice(20, (41, 27), N) True >>> bam.fisher(20, (42, 20), N) > bam.fisher(20, (41, 27), N) False For trigrams, we have to provide more count information: >>> n_w1_w2_w3 = 20 >>> n_w1_w2, n_w1_w3, n_w2_w3 = 35, 60, 40 >>> pair_counts = (n_w1_w2, n_w1_w3, n_w2_w3) >>> n_w1, n_w2, n_w3 = 100, 200, 300 >>> uni_counts = (n_w1, n_w2, n_w3) >>> N = 14307668 >>> tam = TrigramAssocMeasures >>> tam.raw_freq(n_w1_w2_w3, pair_counts, uni_counts, N) == 1. * n_w1_w2_w3 / N True >>> uni_counts2 = (n_w1, n_w2, 100) >>> tam.student_t(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.student_t(n_w1_w2_w3, pair_counts, uni_counts, N) True >>> tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.chi_sq(n_w1_w2_w3, pair_counts, uni_counts, N) True >>> tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.mi_like(n_w1_w2_w3, pair_counts, uni_counts, N) True >>> tam.pmi(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.pmi(n_w1_w2_w3, pair_counts, uni_counts, N) True >>> tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.likelihood_ratio(n_w1_w2_w3, pair_counts, uni_counts, N) True >>> tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.poisson_stirling(n_w1_w2_w3, pair_counts, uni_counts, N) True >>> tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts2, N) > tam.jaccard(n_w1_w2_w3, pair_counts, uni_counts, N) True nltk-3.1/nltk/test/misc.doctest0000644000076500000240000000642112607224144016332 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT -------------------------------------------------------------------------------- Unit tests for the miscellaneous sort functions. 
-------------------------------------------------------------------------------- >>> from copy import deepcopy >>> from nltk.misc.sort import * A (very) small list of unsorted integers. >>> test_data = [12, 67, 7, 28, 92, 56, 53, 720, 91, 57, 20, 20] Test each sorting method - each method returns the number of operations required to sort the data, and sorts in-place (desctructively - hence the need for multiple copies). >>> sorted_data = deepcopy(test_data) >>> selection(sorted_data) 66 >>> sorted_data [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720] >>> sorted_data = deepcopy(test_data) >>> bubble(sorted_data) 30 >>> sorted_data [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720] >>> sorted_data = deepcopy(test_data) >>> merge(sorted_data) 30 >>> sorted_data [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720] >>> sorted_data = deepcopy(test_data) >>> quick(sorted_data) 13 >>> sorted_data [7, 12, 20, 20, 28, 53, 56, 57, 67, 91, 92, 720] -------------------------------------------------------------------------------- Unit tests for Wordfinder class -------------------------------------------------------------------------------- >>> import random >>> # The following is not enough for reproducibility under Python 2/3 >>> # (see http://bugs.python.org/issue9025) so this test is skipped. >>> random.seed(12345) >>> from nltk.misc import wordfinder >>> wordfinder.word_finder() # doctest: +SKIP Word Finder J V L A I R O T A T I S I V O D E R E T H U U B E A R O E P O C S O R E T N E P A D A U Z E E S R A P P A L L M E N T R C X A D Q S Z T P E O R S N G P J A D E I G Y K K T I A A R G F I D T E L C N S R E C N B H T R L T N N B W N T A O A I A Y I L O E I A M E I A A Y U R P L L D G L T V S T S F E A D I P H D O O H N I R L S E C I N I L R N N M E C G R U E A A A Y G I C E N L L E O I G Q R T A E L M R C E T I S T A E T L L E U A E N R L O U O T A S E E C S O O N H Y P A T G Y E M H O M M D R E S F P U L T H C F N V L A C A I M A M A N L B R U T E D O M I O R I L N E E E E E U A R S C R Y L I P H T R K E S N N M S I L A S R E V I N U T X T A A O U T K S E T A R R E S I B J A E D L E L J I F O O R P E L K N I R W K H A I D E Q O P R I C K T I M B E R P Z K D O O H G N I H T U R V E Y D R O P 1: INTERCHANGER 2: TEARLESSNESS 3: UNIVERSALISM 4: DESENSITIZER 5: INTERMENTION 6: TRICHOCYSTIC 7: EXTRAMURALLY 8: VEGETOALKALI 9: PALMELLACEAE 10: AESTHETICISM 11: PETROGRAPHER 12: VISITATORIAL 13: OLEOMARGARIC 14: WRINKLEPROOF 15: PRICKTIMBER 16: PRESIDIALLY 17: SCITAMINEAE 18: ENTEROSCOPE 19: APPALLMENT 20: TURVEYDROP 21: THINGHOOD 22: BISERRATE 23: GREENLAND 24: BRUTEDOM 25: POLONIAN 26: ACOLHUAN 27: LAPORTEA 28: TENDING 29: TEREDO 30: MESOLE 31: UNLIMP 32: OSTARA 33: PILY 34: DUNT 35: ONYX 36: KATH 37: JUNE nltk-3.1/nltk/test/nonmonotonic.doctest0000644000076500000240000002340412607224144020117 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ====================== Nonmonotonic Reasoning ====================== >>> from nltk import * >>> from nltk.inference.nonmonotonic import * >>> from nltk.sem import logic >>> logic._counter._value = 0 >>> read_expr = logic.Expression.fromstring ------------------------ Closed Domain Assumption ------------------------ The only entities in the domain are those found in the assumptions or goal. If the domain only contains "A" and "B", then the expression "exists x.P(x)" can be replaced with "P(A) | P(B)" and an expression "all x.P(x)" can be replaced with "P(A) & P(B)". 
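The rewriting itself is easy to sketch by hand (purely illustrative; ``ClosedDomainProver`` below performs this replacement over the assumptions and goal automatically):

    >>> domain = ['A', 'B']                                          # doctest: +SKIP
    >>> print(read_expr(' | '.join('P(%s)' % e for e in domain)))    # doctest: +SKIP
    (P(A) | P(B))
    >>> print(read_expr(' & '.join('P(%s)' % e for e in domain)))    # doctest: +SKIP
    (P(A) & P(B))
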
>>> p1 = read_expr(r'all x.(man(x) -> mortal(x))') >>> p2 = read_expr(r'man(Socrates)') >>> c = read_expr(r'mortal(Socrates)') >>> prover = Prover9Command(c, [p1,p2]) >>> prover.prove() True >>> cdp = ClosedDomainProver(prover) >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP (man(Socrates) -> mortal(Socrates)) man(Socrates) >>> cdp.prove() True >>> p1 = read_expr(r'exists x.walk(x)') >>> p2 = read_expr(r'man(Socrates)') >>> c = read_expr(r'walk(Socrates)') >>> prover = Prover9Command(c, [p1,p2]) >>> prover.prove() False >>> cdp = ClosedDomainProver(prover) >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP walk(Socrates) man(Socrates) >>> cdp.prove() True >>> p1 = read_expr(r'exists x.walk(x)') >>> p2 = read_expr(r'man(Socrates)') >>> p3 = read_expr(r'-walk(Bill)') >>> c = read_expr(r'walk(Socrates)') >>> prover = Prover9Command(c, [p1,p2,p3]) >>> prover.prove() False >>> cdp = ClosedDomainProver(prover) >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP (walk(Socrates) | walk(Bill)) man(Socrates) -walk(Bill) >>> cdp.prove() True >>> p1 = read_expr(r'walk(Socrates)') >>> p2 = read_expr(r'walk(Bill)') >>> c = read_expr(r'all x.walk(x)') >>> prover = Prover9Command(c, [p1,p2]) >>> prover.prove() False >>> cdp = ClosedDomainProver(prover) >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP walk(Socrates) walk(Bill) >>> print(cdp.goal()) # doctest: +SKIP (walk(Socrates) & walk(Bill)) >>> cdp.prove() True >>> p1 = read_expr(r'girl(mary)') >>> p2 = read_expr(r'dog(rover)') >>> p3 = read_expr(r'all x.(girl(x) -> -dog(x))') >>> p4 = read_expr(r'all x.(dog(x) -> -girl(x))') >>> p5 = read_expr(r'chase(mary, rover)') >>> c = read_expr(r'exists y.(dog(y) & all x.(girl(x) -> chase(x,y)))') >>> prover = Prover9Command(c, [p1,p2,p3,p4,p5]) >>> print(prover.prove()) False >>> cdp = ClosedDomainProver(prover) >>> for a in cdp.assumptions(): print(a) # doctest: +SKIP girl(mary) dog(rover) ((girl(rover) -> -dog(rover)) & (girl(mary) -> -dog(mary))) ((dog(rover) -> -girl(rover)) & (dog(mary) -> -girl(mary))) chase(mary,rover) >>> print(cdp.goal()) # doctest: +SKIP ((dog(rover) & (girl(rover) -> chase(rover,rover)) & (girl(mary) -> chase(mary,rover))) | (dog(mary) & (girl(rover) -> chase(rover,mary)) & (girl(mary) -> chase(mary,mary)))) >>> print(cdp.prove()) True ----------------------- Unique Names Assumption ----------------------- No two entities in the domain represent the same entity unless it can be explicitly proven that they do. Therefore, if the domain contains "A" and "B", then add the assumption "-(A = B)" if it is not the case that " \|- (A = B)". 
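These extra assumptions are just pairwise inequalities over the named entities; a hand-rolled sketch (illustrative only, since ``UniqueNamesProver`` below generates them itself) looks like this:

    >>> from itertools import combinations                                               # doctest: +SKIP
    >>> names = ['Socrates', 'Bill']                                                     # doctest: +SKIP
    >>> for a, b in combinations(names, 2): print(read_expr('-(%s = %s)' % (a, b)))      # doctest: +SKIP
    -(Socrates = Bill)
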
>>> p1 = read_expr(r'man(Socrates)') >>> p2 = read_expr(r'man(Bill)') >>> c = read_expr(r'exists x.exists y.-(x = y)') >>> prover = Prover9Command(c, [p1,p2]) >>> prover.prove() False >>> unp = UniqueNamesProver(prover) >>> for a in unp.assumptions(): print(a) # doctest: +SKIP man(Socrates) man(Bill) -(Socrates = Bill) >>> unp.prove() True >>> p1 = read_expr(r'all x.(walk(x) -> (x = Socrates))') >>> p2 = read_expr(r'Bill = William') >>> p3 = read_expr(r'Bill = Billy') >>> c = read_expr(r'-walk(William)') >>> prover = Prover9Command(c, [p1,p2,p3]) >>> prover.prove() False >>> unp = UniqueNamesProver(prover) >>> for a in unp.assumptions(): print(a) # doctest: +SKIP all x.(walk(x) -> (x = Socrates)) (Bill = William) (Bill = Billy) -(William = Socrates) -(Billy = Socrates) -(Socrates = Bill) >>> unp.prove() True ----------------------- Closed World Assumption ----------------------- The only entities that have certain properties are those that is it stated have the properties. We accomplish this assumption by "completing" predicates. If the assumptions contain "P(A)", then "all x.(P(x) -> (x=A))" is the completion of "P". If the assumptions contain "all x.(ostrich(x) -> bird(x))", then "all x.(bird(x) -> ostrich(x))" is the completion of "bird". If the assumptions don't contain anything that are "P", then "all x.-P(x)" is the completion of "P". >>> p1 = read_expr(r'walk(Socrates)') >>> p2 = read_expr(r'-(Socrates = Bill)') >>> c = read_expr(r'-walk(Bill)') >>> prover = Prover9Command(c, [p1,p2]) >>> prover.prove() False >>> cwp = ClosedWorldProver(prover) >>> for a in cwp.assumptions(): print(a) # doctest: +SKIP walk(Socrates) -(Socrates = Bill) all z1.(walk(z1) -> (z1 = Socrates)) >>> cwp.prove() True >>> p1 = read_expr(r'see(Socrates, John)') >>> p2 = read_expr(r'see(John, Mary)') >>> p3 = read_expr(r'-(Socrates = John)') >>> p4 = read_expr(r'-(John = Mary)') >>> c = read_expr(r'-see(Socrates, Mary)') >>> prover = Prover9Command(c, [p1,p2,p3,p4]) >>> prover.prove() False >>> cwp = ClosedWorldProver(prover) >>> for a in cwp.assumptions(): print(a) # doctest: +SKIP see(Socrates,John) see(John,Mary) -(Socrates = John) -(John = Mary) all z3 z4.(see(z3,z4) -> (((z3 = Socrates) & (z4 = John)) | ((z3 = John) & (z4 = Mary)))) >>> cwp.prove() True >>> p1 = read_expr(r'all x.(ostrich(x) -> bird(x))') >>> p2 = read_expr(r'bird(Tweety)') >>> p3 = read_expr(r'-ostrich(Sam)') >>> p4 = read_expr(r'Sam != Tweety') >>> c = read_expr(r'-bird(Sam)') >>> prover = Prover9Command(c, [p1,p2,p3,p4]) >>> prover.prove() False >>> cwp = ClosedWorldProver(prover) >>> for a in cwp.assumptions(): print(a) # doctest: +SKIP all x.(ostrich(x) -> bird(x)) bird(Tweety) -ostrich(Sam) -(Sam = Tweety) all z7.-ostrich(z7) all z8.(bird(z8) -> ((z8 = Tweety) | ostrich(z8))) >>> print(cwp.prove()) True ----------------------- Multi-Decorator Example ----------------------- Decorators can be nested to utilize multiple assumptions. 
>>> p1 = read_expr(r'see(Socrates, John)') >>> p2 = read_expr(r'see(John, Mary)') >>> c = read_expr(r'-see(Socrates, Mary)') >>> prover = Prover9Command(c, [p1,p2]) >>> print(prover.prove()) False >>> cmd = ClosedDomainProver(UniqueNamesProver(ClosedWorldProver(prover))) >>> print(cmd.prove()) True ----------------- Default Reasoning ----------------- >>> logic._counter._value = 0 >>> premises = [] define the taxonomy >>> premises.append(read_expr(r'all x.(elephant(x) -> animal(x))')) >>> premises.append(read_expr(r'all x.(bird(x) -> animal(x))')) >>> premises.append(read_expr(r'all x.(dove(x) -> bird(x))')) >>> premises.append(read_expr(r'all x.(ostrich(x) -> bird(x))')) >>> premises.append(read_expr(r'all x.(flying_ostrich(x) -> ostrich(x))')) default the properties using abnormalities >>> premises.append(read_expr(r'all x.((animal(x) & -Ab1(x)) -> -fly(x))')) #normal animals don't fly >>> premises.append(read_expr(r'all x.((bird(x) & -Ab2(x)) -> fly(x))')) #normal birds fly >>> premises.append(read_expr(r'all x.((ostrich(x) & -Ab3(x)) -> -fly(x))')) #normal ostriches don't fly specify abnormal entities >>> premises.append(read_expr(r'all x.(bird(x) -> Ab1(x))')) #flight >>> premises.append(read_expr(r'all x.(ostrich(x) -> Ab2(x))')) #non-flying bird >>> premises.append(read_expr(r'all x.(flying_ostrich(x) -> Ab3(x))')) #flying ostrich define entities >>> premises.append(read_expr(r'elephant(el)')) >>> premises.append(read_expr(r'dove(do)')) >>> premises.append(read_expr(r'ostrich(os)')) print the augmented assumptions list >>> prover = Prover9Command(None, premises) >>> command = UniqueNamesProver(ClosedWorldProver(prover)) >>> for a in command.assumptions(): print(a) # doctest: +SKIP all x.(elephant(x) -> animal(x)) all x.(bird(x) -> animal(x)) all x.(dove(x) -> bird(x)) all x.(ostrich(x) -> bird(x)) all x.(flying_ostrich(x) -> ostrich(x)) all x.((animal(x) & -Ab1(x)) -> -fly(x)) all x.((bird(x) & -Ab2(x)) -> fly(x)) all x.((ostrich(x) & -Ab3(x)) -> -fly(x)) all x.(bird(x) -> Ab1(x)) all x.(ostrich(x) -> Ab2(x)) all x.(flying_ostrich(x) -> Ab3(x)) elephant(el) dove(do) ostrich(os) all z1.(animal(z1) -> (elephant(z1) | bird(z1))) all z2.(Ab1(z2) -> bird(z2)) all z3.(bird(z3) -> (dove(z3) | ostrich(z3))) all z4.(dove(z4) -> (z4 = do)) all z5.(Ab2(z5) -> ostrich(z5)) all z6.(Ab3(z6) -> flying_ostrich(z6)) all z7.(ostrich(z7) -> ((z7 = os) | flying_ostrich(z7))) all z8.-flying_ostrich(z8) all z9.(elephant(z9) -> (z9 = el)) -(el = os) -(el = do) -(os = do) >>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('-fly(el)'), premises))).prove() True >>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('fly(do)'), premises))).prove() True >>> UniqueNamesProver(ClosedWorldProver(Prover9Command(read_expr('-fly(os)'), premises))).prove() True nltk-3.1/nltk/test/nonmonotonic_fixt.py0000644000076500000240000000052212574600335020133 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import def setup_module(module): from nose import SkipTest from nltk.inference.mace import Mace try: m = Mace() m._find_binary('mace4') except LookupError: raise SkipTest("Mace4/Prover9 is not available so nonmonotonic.doctest was skipped") nltk-3.1/nltk/test/paice.doctest0000644000076500000240000000232612607224144016460 0ustar sbstaff00000000000000 ===================================================== PAICE's evaluation statistics for stemming algorithms ===================================================== Given a list of words with their real lemmas and stems 
according to stemming algorithm under evaluation, counts Understemming Index (UI), Overstemming Index (OI), Stemming Weight (SW) and Error-rate relative to truncation (ERRT). >>> from nltk.metrics import Paice ------------------------------------- Understemming and Overstemming values ------------------------------------- >>> lemmas = {'kneel': ['kneel', 'knelt'], ... 'range': ['range', 'ranged'], ... 'ring': ['ring', 'rang', 'rung']} >>> stems = {'kneel': ['kneel'], ... 'knelt': ['knelt'], ... 'rang': ['rang', 'range', 'ranged'], ... 'ring': ['ring'], ... 'rung': ['rung']} >>> p = Paice(lemmas, stems) >>> p.gumt, p.gdmt, p.gwmt, p.gdnt (4.0, 5.0, 2.0, 16.0) >>> p.ui, p.oi, p.sw (0.8..., 0.125..., 0.15625...) >>> p.errt 1.0 >>> [('{0:.3f}'.format(a), '{0:.3f}'.format(b)) for a, b in p.coords] [('0.000', '1.000'), ('0.000', '0.375'), ('0.600', '0.125'), ('0.800', '0.125')] nltk-3.1/nltk/test/parse.doctest0000644000076500000240000007656612607224144016532 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ========= Parsing ========= Unit tests for the Context Free Grammar class --------------------------------------------- >>> from nltk import Nonterminal, nonterminals, Production, CFG >>> nt1 = Nonterminal('NP') >>> nt2 = Nonterminal('VP') >>> nt1.symbol() 'NP' >>> nt1 == Nonterminal('NP') True >>> nt1 == nt2 False >>> S, NP, VP, PP = nonterminals('S, NP, VP, PP') >>> N, V, P, DT = nonterminals('N, V, P, DT') >>> prod1 = Production(S, [NP, VP]) >>> prod2 = Production(NP, [DT, NP]) >>> prod1.lhs() S >>> prod1.rhs() (NP, VP) >>> prod1 == Production(S, [NP, VP]) True >>> prod1 == prod2 False >>> grammar = CFG.fromstring(""" ... S -> NP VP ... PP -> P NP ... NP -> 'the' N | N PP | 'the' N PP ... VP -> V NP | V PP | V NP PP ... N -> 'cat' ... N -> 'dog' ... N -> 'rug' ... V -> 'chased' ... V -> 'sat' ... P -> 'in' ... P -> 'on' ... """) Unit tests for the rd (Recursive Descent Parser) class ------------------------------------------------------ Create and run a recursive descent parser over both a syntactically ambiguous and unambiguous sentence. >>> from nltk.parse import RecursiveDescentParser >>> rd = RecursiveDescentParser(grammar) >>> sentence1 = 'the cat chased the dog'.split() >>> sentence2 = 'the cat chased the dog on the rug'.split() >>> for t in rd.parse(sentence1): ... print(t) (S (NP the (N cat)) (VP (V chased) (NP the (N dog)))) >>> for t in rd.parse(sentence2): ... print(t) (S (NP the (N cat)) (VP (V chased) (NP the (N dog) (PP (P on) (NP the (N rug)))))) (S (NP the (N cat)) (VP (V chased) (NP the (N dog)) (PP (P on) (NP the (N rug))))) (dolist (expr doctest-font-lock-keywords) (add-to-list 'font-lock-keywords expr)) font-lock-keywords (add-to-list 'font-lock-keywords (car doctest-font-lock-keywords)) Unit tests for the sr (Shift Reduce Parser) class ------------------------------------------------- Create and run a shift reduce parser over both a syntactically ambiguous and unambiguous sentence. Note that unlike the recursive descent parser, one and only one parse is ever returned. >>> from nltk.parse import ShiftReduceParser >>> sr = ShiftReduceParser(grammar) >>> sentence1 = 'the cat chased the dog'.split() >>> sentence2 = 'the cat chased the dog on the rug'.split() >>> for t in sr.parse(sentence1): ... 
print(t) (S (NP the (N cat)) (VP (V chased) (NP the (N dog)))) The shift reduce parser uses heuristics to decide what to do when there are multiple possible shift or reduce operations available - for the supplied grammar clearly the wrong operation is selected. >>> for t in sr.parse(sentence2): ... print(t) Unit tests for the Chart Parser class ------------------------------------- We use the demo() function for testing. We must turn off showing of times. >>> import nltk First we test tracing with a short sentence >>> nltk.parse.chart.demo(2, print_times=False, trace=1, ... sent='I saw a dog', numparses=1) * Sentence: I saw a dog ['I', 'saw', 'a', 'dog'] * Strategy: Bottom-up |. I . saw . a . dog .| |[---------] . . .| [0:1] 'I' |. [---------] . .| [1:2] 'saw' |. . [---------] .| [2:3] 'a' |. . . [---------]| [3:4] 'dog' |> . . . .| [0:0] NP -> * 'I' |[---------] . . .| [0:1] NP -> 'I' * |> . . . .| [0:0] S -> * NP VP |> . . . .| [0:0] NP -> * NP PP |[---------> . . .| [0:1] S -> NP * VP |[---------> . . .| [0:1] NP -> NP * PP |. > . . .| [1:1] Verb -> * 'saw' |. [---------] . .| [1:2] Verb -> 'saw' * |. > . . .| [1:1] VP -> * Verb NP |. > . . .| [1:1] VP -> * Verb |. [---------> . .| [1:2] VP -> Verb * NP |. [---------] . .| [1:2] VP -> Verb * |. > . . .| [1:1] VP -> * VP PP |[-------------------] . .| [0:2] S -> NP VP * |. [---------> . .| [1:2] VP -> VP * PP |. . > . .| [2:2] Det -> * 'a' |. . [---------] .| [2:3] Det -> 'a' * |. . > . .| [2:2] NP -> * Det Noun |. . [---------> .| [2:3] NP -> Det * Noun |. . . > .| [3:3] Noun -> * 'dog' |. . . [---------]| [3:4] Noun -> 'dog' * |. . [-------------------]| [2:4] NP -> Det Noun * |. . > . .| [2:2] S -> * NP VP |. . > . .| [2:2] NP -> * NP PP |. [-----------------------------]| [1:4] VP -> Verb NP * |. . [------------------->| [2:4] S -> NP * VP |. . [------------------->| [2:4] NP -> NP * PP |[=======================================]| [0:4] S -> NP VP * |. [----------------------------->| [1:4] VP -> VP * PP Nr edges in chart: 33 (S (NP I) (VP (Verb saw) (NP (Det a) (Noun dog)))) Then we test the different parsing Strategies. Note that the number of edges differ between the strategies. Top-down >>> nltk.parse.chart.demo(1, print_times=False, trace=0, ... sent='I saw John with a dog', numparses=2) * Sentence: I saw John with a dog ['I', 'saw', 'John', 'with', 'a', 'dog'] * Strategy: Top-down Nr edges in chart: 48 (S (NP I) (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) (S (NP I) (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) Bottom-up >>> nltk.parse.chart.demo(2, print_times=False, trace=0, ... sent='I saw John with a dog', numparses=2) * Sentence: I saw John with a dog ['I', 'saw', 'John', 'with', 'a', 'dog'] * Strategy: Bottom-up Nr edges in chart: 53 (S (NP I) (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) (S (NP I) (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) Bottom-up Left-Corner >>> nltk.parse.chart.demo(3, print_times=False, trace=0, ... sent='I saw John with a dog', numparses=2) * Sentence: I saw John with a dog ['I', 'saw', 'John', 'with', 'a', 'dog'] * Strategy: Bottom-up left-corner Nr edges in chart: 36 (S (NP I) (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) (S (NP I) (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) Left-Corner with Bottom-Up Filter >>> nltk.parse.chart.demo(4, print_times=False, trace=0, ... 
sent='I saw John with a dog', numparses=2) * Sentence: I saw John with a dog ['I', 'saw', 'John', 'with', 'a', 'dog'] * Strategy: Filtered left-corner Nr edges in chart: 28 (S (NP I) (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) (S (NP I) (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) The stepping chart parser >>> nltk.parse.chart.demo(5, print_times=False, trace=1, ... sent='I saw John with a dog', numparses=2) * Sentence: I saw John with a dog ['I', 'saw', 'John', 'with', 'a', 'dog'] * Strategy: Stepping (top-down vs bottom-up) *** SWITCH TO TOP DOWN |[------] . . . . .| [0:1] 'I' |. [------] . . . .| [1:2] 'saw' |. . [------] . . .| [2:3] 'John' |. . . [------] . .| [3:4] 'with' |. . . . [------] .| [4:5] 'a' |. . . . . [------]| [5:6] 'dog' |> . . . . . .| [0:0] S -> * NP VP |> . . . . . .| [0:0] NP -> * NP PP |> . . . . . .| [0:0] NP -> * Det Noun |> . . . . . .| [0:0] NP -> * 'I' |[------] . . . . .| [0:1] NP -> 'I' * |[------> . . . . .| [0:1] S -> NP * VP |[------> . . . . .| [0:1] NP -> NP * PP |. > . . . . .| [1:1] VP -> * VP PP |. > . . . . .| [1:1] VP -> * Verb NP |. > . . . . .| [1:1] VP -> * Verb |. > . . . . .| [1:1] Verb -> * 'saw' |. [------] . . . .| [1:2] Verb -> 'saw' * |. [------> . . . .| [1:2] VP -> Verb * NP |. [------] . . . .| [1:2] VP -> Verb * |[-------------] . . . .| [0:2] S -> NP VP * |. [------> . . . .| [1:2] VP -> VP * PP *** SWITCH TO BOTTOM UP |. . > . . . .| [2:2] NP -> * 'John' |. . . > . . .| [3:3] PP -> * 'with' NP |. . . > . . .| [3:3] Prep -> * 'with' |. . . . > . .| [4:4] Det -> * 'a' |. . . . . > .| [5:5] Noun -> * 'dog' |. . [------] . . .| [2:3] NP -> 'John' * |. . . [------> . .| [3:4] PP -> 'with' * NP |. . . [------] . .| [3:4] Prep -> 'with' * |. . . . [------] .| [4:5] Det -> 'a' * |. . . . . [------]| [5:6] Noun -> 'dog' * |. [-------------] . . .| [1:3] VP -> Verb NP * |[--------------------] . . .| [0:3] S -> NP VP * |. [-------------> . . .| [1:3] VP -> VP * PP |. . > . . . .| [2:2] S -> * NP VP |. . > . . . .| [2:2] NP -> * NP PP |. . . . > . .| [4:4] NP -> * Det Noun |. . [------> . . .| [2:3] S -> NP * VP |. . [------> . . .| [2:3] NP -> NP * PP |. . . . [------> .| [4:5] NP -> Det * Noun |. . . . [-------------]| [4:6] NP -> Det Noun * |. . . [--------------------]| [3:6] PP -> 'with' NP * |. [----------------------------------]| [1:6] VP -> VP PP * *** SWITCH TO TOP DOWN |. . > . . . .| [2:2] NP -> * Det Noun |. . . . > . .| [4:4] NP -> * NP PP |. . . > . . .| [3:3] VP -> * VP PP |. . . > . . .| [3:3] VP -> * Verb NP |. . . > . . .| [3:3] VP -> * Verb |[=========================================]| [0:6] S -> NP VP * |. [---------------------------------->| [1:6] VP -> VP * PP |. . [---------------------------]| [2:6] NP -> NP PP * |. . . . [------------->| [4:6] NP -> NP * PP |. [----------------------------------]| [1:6] VP -> Verb NP * |. . [--------------------------->| [2:6] S -> NP * VP |. . [--------------------------->| [2:6] NP -> NP * PP |[=========================================]| [0:6] S -> NP VP * |. [---------------------------------->| [1:6] VP -> VP * PP |. . . . . . >| [6:6] VP -> * VP PP |. . . . . . >| [6:6] VP -> * Verb NP |. . . . . . >| [6:6] VP -> * Verb *** SWITCH TO BOTTOM UP |. . . . > . .| [4:4] S -> * NP VP |. . . . 
[------------->| [4:6] S -> NP * VP *** SWITCH TO TOP DOWN *** SWITCH TO BOTTOM UP *** SWITCH TO TOP DOWN *** SWITCH TO BOTTOM UP *** SWITCH TO TOP DOWN *** SWITCH TO BOTTOM UP Nr edges in chart: 61 (S (NP I) (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) (S (NP I) (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) Unit tests for the Incremental Chart Parser class ------------------------------------------------- The incremental chart parsers are defined in earleychart.py. We use the demo() function for testing. We must turn off showing of times. >>> import nltk Earley Chart Parser >>> nltk.parse.earleychart.demo(print_times=False, trace=1, ... sent='I saw John with a dog', numparses=2) * Sentence: I saw John with a dog ['I', 'saw', 'John', 'with', 'a', 'dog'] |. I . saw . John . with . a . dog .| |[------] . . . . .| [0:1] 'I' |. [------] . . . .| [1:2] 'saw' |. . [------] . . .| [2:3] 'John' |. . . [------] . .| [3:4] 'with' |. . . . [------] .| [4:5] 'a' |. . . . . [------]| [5:6] 'dog' |> . . . . . .| [0:0] S -> * NP VP |> . . . . . .| [0:0] NP -> * NP PP |> . . . . . .| [0:0] NP -> * Det Noun |> . . . . . .| [0:0] NP -> * 'I' |[------] . . . . .| [0:1] NP -> 'I' * |[------> . . . . .| [0:1] S -> NP * VP |[------> . . . . .| [0:1] NP -> NP * PP |. > . . . . .| [1:1] VP -> * VP PP |. > . . . . .| [1:1] VP -> * Verb NP |. > . . . . .| [1:1] VP -> * Verb |. > . . . . .| [1:1] Verb -> * 'saw' |. [------] . . . .| [1:2] Verb -> 'saw' * |. [------> . . . .| [1:2] VP -> Verb * NP |. [------] . . . .| [1:2] VP -> Verb * |[-------------] . . . .| [0:2] S -> NP VP * |. [------> . . . .| [1:2] VP -> VP * PP |. . > . . . .| [2:2] NP -> * NP PP |. . > . . . .| [2:2] NP -> * Det Noun |. . > . . . .| [2:2] NP -> * 'John' |. . [------] . . .| [2:3] NP -> 'John' * |. [-------------] . . .| [1:3] VP -> Verb NP * |. . [------> . . .| [2:3] NP -> NP * PP |. . . > . . .| [3:3] PP -> * 'with' NP |[--------------------] . . .| [0:3] S -> NP VP * |. [-------------> . . .| [1:3] VP -> VP * PP |. . . [------> . .| [3:4] PP -> 'with' * NP |. . . . > . .| [4:4] NP -> * NP PP |. . . . > . .| [4:4] NP -> * Det Noun |. . . . > . .| [4:4] Det -> * 'a' |. . . . [------] .| [4:5] Det -> 'a' * |. . . . [------> .| [4:5] NP -> Det * Noun |. . . . . > .| [5:5] Noun -> * 'dog' |. . . . . [------]| [5:6] Noun -> 'dog' * |. . . . [-------------]| [4:6] NP -> Det Noun * |. . . [--------------------]| [3:6] PP -> 'with' NP * |. . . . [------------->| [4:6] NP -> NP * PP |. . [---------------------------]| [2:6] NP -> NP PP * |. [----------------------------------]| [1:6] VP -> VP PP * |[=========================================]| [0:6] S -> NP VP * |. [---------------------------------->| [1:6] VP -> VP * PP |. [----------------------------------]| [1:6] VP -> Verb NP * |. . [--------------------------->| [2:6] NP -> NP * PP |[=========================================]| [0:6] S -> NP VP * |. [---------------------------------->| [1:6] VP -> VP * PP (S (NP I) (VP (VP (Verb saw) (NP John)) (PP with (NP (Det a) (Noun dog))))) (S (NP I) (VP (Verb saw) (NP (NP John) (PP with (NP (Det a) (Noun dog)))))) Unit tests for LARGE context-free grammars ------------------------------------------ Reading the ATIS grammar. >>> grammar = nltk.data.load('grammars/large_grammars/atis.cfg') >>> grammar Reading the test sentences. 
>>> sentences = nltk.data.load('grammars/large_grammars/atis_sentences.txt') >>> sentences = nltk.parse.util.extract_test_sentences(sentences) >>> len(sentences) 98 >>> testsentence = sentences[22] >>> testsentence[0] ['show', 'me', 'northwest', 'flights', 'to', 'detroit', '.'] >>> testsentence[1] 17 >>> sentence = testsentence[0] Now we test all different parsing strategies. Note that the number of edges differ between the strategies. Bottom-up parsing. >>> parser = nltk.parse.BottomUpChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 7661 >>> print((len(list(chart.parses(grammar.start()))))) 17 Bottom-up Left-corner parsing. >>> parser = nltk.parse.BottomUpLeftCornerChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 4986 >>> print((len(list(chart.parses(grammar.start()))))) 17 Left-corner parsing with bottom-up filter. >>> parser = nltk.parse.LeftCornerChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 1342 >>> print((len(list(chart.parses(grammar.start()))))) 17 Top-down parsing. >>> parser = nltk.parse.TopDownChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 28352 >>> print((len(list(chart.parses(grammar.start()))))) 17 Incremental Bottom-up parsing. >>> parser = nltk.parse.IncrementalBottomUpChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 7661 >>> print((len(list(chart.parses(grammar.start()))))) 17 Incremental Bottom-up Left-corner parsing. >>> parser = nltk.parse.IncrementalBottomUpLeftCornerChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 4986 >>> print((len(list(chart.parses(grammar.start()))))) 17 Incremental Left-corner parsing with bottom-up filter. >>> parser = nltk.parse.IncrementalLeftCornerChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 1342 >>> print((len(list(chart.parses(grammar.start()))))) 17 Incremental Top-down parsing. >>> parser = nltk.parse.IncrementalTopDownChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 28352 >>> print((len(list(chart.parses(grammar.start()))))) 17 Earley parsing. This is similar to the incremental top-down algorithm. >>> parser = nltk.parse.EarleyChartParser(grammar) >>> chart = parser.chart_parse(sentence) >>> print((chart.num_edges())) 28352 >>> print((len(list(chart.parses(grammar.start()))))) 17 Unit tests for the Probabilistic CFG class ------------------------------------------ >>> from nltk.corpus import treebank >>> from itertools import islice >>> from nltk.grammar import PCFG, induce_pcfg, toy_pcfg1, toy_pcfg2 Create a set of PCFG productions. >>> grammar = PCFG.fromstring(""" ... A -> B B [.3] | C B C [.7] ... B -> B D [.5] | C [.5] ... C -> 'a' [.1] | 'b' [0.9] ... D -> 'b' [1.0] ... """) >>> prod = grammar.productions()[0] >>> prod A -> B B [0.3] >>> prod.lhs() A >>> prod.rhs() (B, B) >>> print((prod.prob())) 0.3 >>> grammar.start() A >>> grammar.productions() [A -> B B [0.3], A -> C B C [0.7], B -> B D [0.5], B -> C [0.5], C -> 'a' [0.1], C -> 'b' [0.9], D -> 'b' [1.0]] Induce some productions using parsed Treebank data. >>> productions = [] >>> for fileid in treebank.fileids()[:2]: ... for t in treebank.parsed_sents(fileid): ... 
productions += t.productions() >>> grammar = induce_pcfg(S, productions) >>> grammar >>> sorted(grammar.productions(lhs=Nonterminal('PP')))[:2] [PP -> IN NP [1.0]] >>> sorted(grammar.productions(lhs=Nonterminal('NNP')))[:2] [NNP -> 'Agnew' [0.0714286], NNP -> 'Consolidated' [0.0714286]] >>> sorted(grammar.productions(lhs=Nonterminal('JJ')))[:2] [JJ -> 'British' [0.142857], JJ -> 'former' [0.142857]] >>> sorted(grammar.productions(lhs=Nonterminal('NP')))[:2] [NP -> CD NNS [0.133333], NP -> DT JJ JJ NN [0.0666667]] Unit tests for the Probabilistic Chart Parse classes ---------------------------------------------------- >>> tokens = "Jack saw Bob with my cookie".split() >>> grammar = toy_pcfg2 >>> print(grammar) Grammar with 23 productions (start state = S) S -> NP VP [1.0] VP -> V NP [0.59] VP -> V [0.4] VP -> VP PP [0.01] NP -> Det N [0.41] NP -> Name [0.28] NP -> NP PP [0.31] PP -> P NP [1.0] V -> 'saw' [0.21] V -> 'ate' [0.51] V -> 'ran' [0.28] N -> 'boy' [0.11] N -> 'cookie' [0.12] N -> 'table' [0.13] N -> 'telescope' [0.14] N -> 'hill' [0.5] Name -> 'Jack' [0.52] Name -> 'Bob' [0.48] P -> 'with' [0.61] P -> 'under' [0.39] Det -> 'the' [0.41] Det -> 'a' [0.31] Det -> 'my' [0.28] Create several parsers using different queuing strategies and show the resulting parses. >>> from nltk.parse import pchart >>> parser = pchart.InsideChartParser(grammar) >>> for t in parser.parse(tokens): ... print(t) (S (NP (Name Jack)) (VP (V saw) (NP (NP (Name Bob)) (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) (S (NP (Name Jack)) (VP (VP (V saw) (NP (Name Bob))) (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) >>> parser = pchart.RandomChartParser(grammar) >>> for t in parser.parse(tokens): ... print(t) (S (NP (Name Jack)) (VP (V saw) (NP (NP (Name Bob)) (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) (S (NP (Name Jack)) (VP (VP (V saw) (NP (Name Bob))) (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) >>> parser = pchart.UnsortedChartParser(grammar) >>> for t in parser.parse(tokens): ... print(t) (S (NP (Name Jack)) (VP (V saw) (NP (NP (Name Bob)) (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) (S (NP (Name Jack)) (VP (VP (V saw) (NP (Name Bob))) (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) >>> parser = pchart.LongestChartParser(grammar) >>> for t in parser.parse(tokens): ... print(t) (S (NP (Name Jack)) (VP (V saw) (NP (NP (Name Bob)) (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) (S (NP (Name Jack)) (VP (VP (V saw) (NP (Name Bob))) (PP (P with) (NP (Det my) (N cookie))))) (p=2.03744e-07) >>> parser = pchart.InsideChartParser(grammar, beam_size = len(tokens)+1) >>> for t in parser.parse(tokens): ... print(t) Unit tests for the Viterbi Parse classes ---------------------------------------- >>> from nltk.parse import ViterbiParser >>> tokens = "Jack saw Bob with my cookie".split() >>> grammar = toy_pcfg2 Parse the tokenized sentence. >>> parser = ViterbiParser(grammar) >>> for t in parser.parse(tokens): ... print(t) (S (NP (Name Jack)) (VP (V saw) (NP (NP (Name Bob)) (PP (P with) (NP (Det my) (N cookie)))))) (p=6.31607e-06) Unit tests for the FeatStructNonterminal class ---------------------------------------------- >>> from nltk.grammar import FeatStructNonterminal >>> FeatStructNonterminal( ... 
pos='n', agr=FeatStructNonterminal(number='pl', gender='f')) [agr=[gender='f', number='pl'], pos='n'] >>> FeatStructNonterminal('VP[+fin]/NP[+pl]') VP[+fin]/NP[+pl] Tracing the Feature Chart Parser -------------------------------- We use the featurechart.demo() function for tracing the Feature Chart Parser. >>> nltk.parse.featurechart.demo(print_times=False, ... print_grammar=True, ... parser=nltk.parse.featurechart.FeatureChartParser, ... sent='I saw John with a dog') Grammar with 18 productions (start state = S[]) S[] -> NP[] VP[] PP[] -> Prep[] NP[] NP[] -> NP[] PP[] VP[] -> VP[] PP[] VP[] -> Verb[] NP[] VP[] -> Verb[] NP[] -> Det[pl=?x] Noun[pl=?x] NP[] -> 'John' NP[] -> 'I' Det[] -> 'the' Det[] -> 'my' Det[-pl] -> 'a' Noun[-pl] -> 'dog' Noun[-pl] -> 'cookie' Verb[] -> 'ate' Verb[] -> 'saw' Prep[] -> 'with' Prep[] -> 'under' * FeatureChartParser Sentence: I saw John with a dog |.I.s.J.w.a.d.| |[-] . . . . .| [0:1] 'I' |. [-] . . . .| [1:2] 'saw' |. . [-] . . .| [2:3] 'John' |. . . [-] . .| [3:4] 'with' |. . . . [-] .| [4:5] 'a' |. . . . . [-]| [5:6] 'dog' |[-] . . . . .| [0:1] NP[] -> 'I' * |[-> . . . . .| [0:1] S[] -> NP[] * VP[] {} |[-> . . . . .| [0:1] NP[] -> NP[] * PP[] {} |. [-] . . . .| [1:2] Verb[] -> 'saw' * |. [-> . . . .| [1:2] VP[] -> Verb[] * NP[] {} |. [-] . . . .| [1:2] VP[] -> Verb[] * |. [-> . . . .| [1:2] VP[] -> VP[] * PP[] {} |[---] . . . .| [0:2] S[] -> NP[] VP[] * |. . [-] . . .| [2:3] NP[] -> 'John' * |. . [-> . . .| [2:3] S[] -> NP[] * VP[] {} |. . [-> . . .| [2:3] NP[] -> NP[] * PP[] {} |. [---] . . .| [1:3] VP[] -> Verb[] NP[] * |. [---> . . .| [1:3] VP[] -> VP[] * PP[] {} |[-----] . . .| [0:3] S[] -> NP[] VP[] * |. . . [-] . .| [3:4] Prep[] -> 'with' * |. . . [-> . .| [3:4] PP[] -> Prep[] * NP[] {} |. . . . [-] .| [4:5] Det[-pl] -> 'a' * |. . . . [-> .| [4:5] NP[] -> Det[pl=?x] * Noun[pl=?x] {?x: False} |. . . . . [-]| [5:6] Noun[-pl] -> 'dog' * |. . . . [---]| [4:6] NP[] -> Det[-pl] Noun[-pl] * |. . . . [--->| [4:6] S[] -> NP[] * VP[] {} |. . . . [--->| [4:6] NP[] -> NP[] * PP[] {} |. . . [-----]| [3:6] PP[] -> Prep[] NP[] * |. . [-------]| [2:6] NP[] -> NP[] PP[] * |. [---------]| [1:6] VP[] -> VP[] PP[] * |. [--------->| [1:6] VP[] -> VP[] * PP[] {} |[===========]| [0:6] S[] -> NP[] VP[] * |. . [------->| [2:6] S[] -> NP[] * VP[] {} |. . [------->| [2:6] NP[] -> NP[] * PP[] {} |. [---------]| [1:6] VP[] -> Verb[] NP[] * |. [--------->| [1:6] VP[] -> VP[] * PP[] {} |[===========]| [0:6] S[] -> NP[] VP[] * (S[] (NP[] I) (VP[] (VP[] (Verb[] saw) (NP[] John)) (PP[] (Prep[] with) (NP[] (Det[-pl] a) (Noun[-pl] dog))))) (S[] (NP[] I) (VP[] (Verb[] saw) (NP[] (NP[] John) (PP[] (Prep[] with) (NP[] (Det[-pl] a) (Noun[-pl] dog)))))) Unit tests for the Feature Chart Parser classes ----------------------------------------------- The list of parsers we want to test. >>> parsers = [nltk.parse.featurechart.FeatureChartParser, ... nltk.parse.featurechart.FeatureTopDownChartParser, ... nltk.parse.featurechart.FeatureBottomUpChartParser, ... nltk.parse.featurechart.FeatureBottomUpLeftCornerChartParser, ... nltk.parse.earleychart.FeatureIncrementalChartParser, ... nltk.parse.earleychart.FeatureEarleyChartParser, ... nltk.parse.earleychart.FeatureIncrementalTopDownChartParser, ... nltk.parse.earleychart.FeatureIncrementalBottomUpChartParser, ... nltk.parse.earleychart.FeatureIncrementalBottomUpLeftCornerChartParser, ... ] A helper function that tests each parser on the given grammar and sentence. 
We check that the number of trees are correct, and that all parsers return the same trees. Otherwise an error is printed. >>> def unittest(grammar, sentence, nr_trees): ... sentence = sentence.split() ... trees = None ... for P in parsers: ... result = P(grammar).parse(sentence) ... result = set(tree.freeze() for tree in result) ... if len(result) != nr_trees: ... print("Wrong nr of trees:", len(result)) ... elif trees is None: ... trees = result ... elif result != trees: ... print("Trees differ for parser:", P.__name__) The demo grammar from before, with an ambiguous sentence. >>> isawjohn = nltk.parse.featurechart.demo_grammar() >>> unittest(isawjohn, "I saw John with a dog with my cookie", 5) This grammar tests that variables in different grammar rules are renamed before unification. (The problematic variable is in this case ?X). >>> whatwasthat = nltk.grammar.FeatureGrammar.fromstring(''' ... S[] -> NP[num=?N] VP[num=?N, slash=?X] ... NP[num=?X] -> "what" ... NP[num=?X] -> "that" ... VP[num=?P, slash=none] -> V[num=?P] NP[] ... V[num=sg] -> "was" ... ''') >>> unittest(whatwasthat, "what was that", 1) This grammar tests that the same rule can be used in different places in another rule, and that the variables are properly renamed. >>> thislovesthat = nltk.grammar.FeatureGrammar.fromstring(''' ... S[] -> NP[case=nom] V[] NP[case=acc] ... NP[case=?X] -> Pron[case=?X] ... Pron[] -> "this" ... Pron[] -> "that" ... V[] -> "loves" ... ''') >>> unittest(thislovesthat, "this loves that", 1) Tests for loading feature grammar files --------------------------------------- Alternative 1: first load the grammar, then create the parser. >>> fcfg = nltk.data.load('grammars/book_grammars/feat0.fcfg') >>> fcp1 = nltk.parse.FeatureChartParser(fcfg) >>> print((type(fcp1))) Alternative 2: directly load the parser. >>> fcp2 = nltk.parse.load_parser('grammars/book_grammars/feat0.fcfg') >>> print((type(fcp2))) nltk-3.1/nltk/test/portuguese_en.doctest0000644000076500000240000005437512607224144020276 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ================================== Examples for Portuguese Processing ================================== This HOWTO contains a variety of examples relating to the Portuguese language. It is intended to be read in conjunction with the NLTK book (``http://nltk.org/book``). For instructions on running the Python interpreter, please see the section *Getting Started with Python*, in Chapter 1. -------------------------------------------- Python Programming, with Portuguese Examples -------------------------------------------- Chapter 1 of the NLTK book contains many elementary programming examples, all with English texts. In this section, we'll see some corresponding examples using Portuguese. Please refer to the chapter for full discussion. *Vamos!* >>> from nltk.examples.pt import * *** Introductory Examples for the NLTK Book *** Loading ptext1, ... and psent1, ... Type the name of the text or sentence to view it. Type: 'texts()' or 'sents()' to list the materials. ptext1: Memórias Póstumas de Brás Cubas (1881) ptext2: Dom Casmurro (1899) ptext3: Gênesis ptext4: Folha de Sao Paulo (1994) Any time we want to find out about these texts, we just have to enter their names at the Python prompt: >>> ptext2 Searching Text -------------- A concordance permits us to see words in context. >>> ptext1.concordance('olhos') Building index... 
Displaying 25 of 138 matches: De pé , à cabeceira da cama , com os olhos estúpidos , a boca entreaberta , a t orelhas . Pela minha parte fechei os olhos e deixei - me ir à ventura . Já agor xões de cérebro enfermo . Como ia de olhos fechados , não via o caminho ; lembr gelos eternos . Com efeito , abri os olhos e vi que o meu animal galopava numa me apareceu então , fitando - me uns olhos rutilantes como o sol . Tudo nessa f mim mesmo . Então , encarei - a com olhos súplices , e pedi mais alguns anos . ... For a given word, we can find words with a similar text distribution: >>> ptext1.similar('chegar') Building word-context index... acabada acudir aludir avistar bramanismo casamento cheguei com contar contrário corpo dali deixei desferirem dizer fazer filhos já leitor lhe >>> ptext3.similar('chegar') Building word-context index... achar alumiar arrombar destruir governar guardar ir lavrar passar que toda tomar ver vir We can search for the statistically significant collocations in a text: >>> ptext1.collocations() Building collocations list Quincas Borba; Lobo Neves; alguma coisa; Brás Cubas; meu pai; dia seguinte; não sei; Meu pai; alguns instantes; outra vez; outra coisa; por exemplo; mim mesmo; coisa nenhuma; mesma coisa; não era; dias depois; Passeio Público; olhar para; das coisas We can search for words in context, with the help of *regular expressions*, e.g.: >>> ptext1.findall(" (<.*>)") estúpidos; e; fechados; rutilantes; súplices; a; do; babavam; na; moles; se; da; umas; espraiavam; chamejantes; espetados; ... We can automatically generate random text based on a given text, e.g.: >>> ptext3.generate() # doctest: +SKIP No princípio , criou Deus os abençoou , dizendo : Onde { estão } e até à ave dos céus , { que } será . Disse mais Abrão : Dá - me a mulher que tomaste ; porque daquele poço Eseque , { tinha .} E disse : Não poderemos descer ; mas , do campo ainda não estava na casa do teu pescoço . E viveu Serugue , depois Simeão e Levi { são } estes ? E o varão , porque habitava na terra de Node , da mão de Esaú : Jeús , Jalão e Corá Texts as List of Words ---------------------- A few sentences have been defined for you. >>> psent1 ['o', 'amor', 'da', 'gl\xf3ria', 'era', 'a', 'coisa', 'mais', 'verdadeiramente', 'humana', 'que', 'h\xe1', 'no', 'homem', ',', 'e', ',', 'conseq\xfcentemente', ',', 'a', 'sua', 'mais', 'genu\xedna', 'fei\xe7\xe3o', '.'] >>> Notice that the sentence has been *tokenized*. Each token is represented as a string, represented using quotes, e.g. ``'coisa'``. Some strings contain special characters, e.g. ``\xf3``, the internal representation for ó. The tokens are combined in the form of a *list*. How long is this list? >>> len(psent1) 25 >>> What is the vocabulary of this sentence? >>> sorted(set(psent1)) [',', '.', 'a', 'amor', 'coisa', 'conseqüentemente', 'da', 'e', 'era', 'feição', 'genuína', 'glória', 'homem', 'humana', 'há', 'mais', 'no', 'o', 'que', 'sua', 'verdadeiramente'] >>> Let's iterate over each item in ``psent2``, and print information for each: >>> for w in psent2: ... print(w, len(w), w[-1]) ... Não 3 o consultes 9 s dicionários 11 s . 1 . Observe how we make a human-readable version of a string, using ``decode()``. Also notice that we accessed the last character of a string ``w`` using ``w[-1]``. We just saw a ``for`` loop above. Another useful control structure is a *list comprehension*. 
>>> [w.upper() for w in psent2] ['N\xc3O', 'CONSULTES', 'DICION\xc1RIOS', '.'] >>> [w for w in psent1 if w.endswith('a')] ['da', 'gl\xf3ria', 'era', 'a', 'coisa', 'humana', 'a', 'sua', 'genu\xedna'] >>> [w for w in ptext4 if len(w) > 15] [u'norte-irlandeses', u'pan-nacionalismo', u'predominatemente', u'primeiro-ministro', u'primeiro-ministro', u'irlandesa-americana', u'responsabilidades', u'significativamente'] We can examine the relative frequency of words in a text, using ``FreqDist``: >>> fd1 = FreqDist(ptext1) >>> fd1 >>> fd1['olhos'] 137 >>> fd1.max() u',' >>> fd1.samples()[:100] [u',', u'.', u'a', u'que', u'de', u'e', u'-', u'o', u';', u'me', u'um', u'n\xe3o', u'\x97', u'se', u'do', u'da', u'uma', u'com', u'os', u'\xe9', u'era', u'as', u'eu', u'lhe', u'ao', u'em', u'para', u'mas', u'...', u'!', u'\xe0', u'na', u'mais', u'?', u'no', u'como', u'por', u'N\xe3o', u'dos', u'ou', u'ele', u':', u'Virg\xedlia', u'meu', u'disse', u'minha', u'das', u'O', u'/', u'A', u'CAP\xcdTULO', u'muito', u'depois', u'coisa', u'foi', u'sem', u'olhos', u'ela', u'nos', u'tinha', u'nem', u'E', u'outro', u'vida', u'nada', u'tempo', u'menos', u'outra', u'casa', u'homem', u'porque', u'quando', u'mim', u'mesmo', u'ser', u'pouco', u'estava', u'dia', u't\xe3o', u'tudo', u'Mas', u'at\xe9', u'D', u'ainda', u's\xf3', u'alguma', u'la', u'vez', u'anos', u'h\xe1', u'Era', u'pai', u'esse', u'lo', u'dizer', u'assim', u'ent\xe3o', u'dizia', u'aos', u'Borba'] --------------- Reading Corpora --------------- Accessing the Machado Text Corpus --------------------------------- NLTK includes the complete works of Machado de Assis. >>> from nltk.corpus import machado >>> machado.fileids() ['contos/macn001.txt', 'contos/macn002.txt', 'contos/macn003.txt', ...] Each file corresponds to one of the works of Machado de Assis. To see a complete list of works, you can look at the corpus README file: ``print machado.readme()``. Let's access the text of the *Posthumous Memories of Brás Cubas*. We can access the text as a list of characters, and access 200 characters starting from position 10,000. >>> raw_text = machado.raw('romance/marm05.txt') >>> raw_text[10000:10200] u', primou no\nEstado, e foi um dos amigos particulares do vice-rei Conde da Cunha.\n\nComo este apelido de Cubas lhe\ncheirasse excessivamente a tanoaria, alegava meu pai, bisneto de Dami\xe3o, que o\ndito ape' However, this is not a very useful way to work with a text. We generally think of a text as a sequence of words and punctuation, not characters: >>> text1 = machado.words('romance/marm05.txt') >>> text1 ['Romance', ',', 'Mem\xf3rias', 'P\xf3stumas', 'de', ...] >>> len(text1) 77098 >>> len(set(text1)) 10848 Here's a program that finds the most common ngrams that contain a particular target word. >>> from nltk import ngrams, FreqDist >>> target_word = 'olhos' >>> fd = FreqDist(ng ... for ng in ngrams(text1, 5) ... if target_word in ng) >>> for hit in fd.samples(): ... print(' '.join(hit)) ... , com os olhos no com os olhos no ar com os olhos no chão e todos com os olhos me estar com os olhos os olhos estúpidos , a os olhos na costura , os olhos no ar , , com os olhos espetados , com os olhos estúpidos , com os olhos fitos , com os olhos naquele , com os olhos para Accessing the MacMorpho Tagged Corpus ------------------------------------- NLTK includes the MAC-MORPHO Brazilian Portuguese POS-tagged news text, with over a million words of journalistic texts extracted from ten sections of the daily newspaper *Folha de Sao Paulo*, 1994. 
We can access this corpus as a sequence of words or tagged words as follows: >>> import nltk.corpus >>> nltk.corpus.mac_morpho.words() ['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', ...] >>> nltk.corpus.mac_morpho.sents() # doctest: +NORMALIZE_WHITESPACE [['Jersei', 'atinge', 'm\xe9dia', 'de', 'Cr$', '1,4', 'milh\xe3o', 'em', 'a', 'venda', 'de', 'a', 'Pinhal', 'em', 'S\xe3o', 'Paulo'], ['Programe', 'sua', 'viagem', 'a', 'a', 'Exposi\xe7\xe3o', 'Nacional', 'do', 'Zebu', ',', 'que', 'come\xe7a', 'dia', '25'], ...] >>> nltk.corpus.mac_morpho.tagged_words() [('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ...] We can also access it in sentence chunks. >>> nltk.corpus.mac_morpho.tagged_sents() # doctest: +NORMALIZE_WHITESPACE [[('Jersei', 'N'), ('atinge', 'V'), ('m\xe9dia', 'N'), ('de', 'PREP'), ('Cr$', 'CUR'), ('1,4', 'NUM'), ('milh\xe3o', 'N'), ('em', 'PREP|+'), ('a', 'ART'), ('venda', 'N'), ('de', 'PREP|+'), ('a', 'ART'), ('Pinhal', 'NPROP'), ('em', 'PREP'), ('S\xe3o', 'NPROP'), ('Paulo', 'NPROP')], [('Programe', 'V'), ('sua', 'PROADJ'), ('viagem', 'N'), ('a', 'PREP|+'), ('a', 'ART'), ('Exposi\xe7\xe3o', 'NPROP'), ('Nacional', 'NPROP'), ('do', 'NPROP'), ('Zebu', 'NPROP'), (',', ','), ('que', 'PRO-KS-REL'), ('come\xe7a', 'V'), ('dia', 'N'), ('25', 'N|AP')], ...] This data can be used to train taggers (examples below for the Floresta treebank). Accessing the Floresta Portuguese Treebank ------------------------------------------ The NLTK data distribution includes the "Floresta Sinta(c)tica Corpus" version 7.4, available from ``http://www.linguateca.pt/Floresta/``. We can access this corpus as a sequence of words or tagged words as follows: >>> from nltk.corpus import floresta >>> floresta.words() ['Um', 'revivalismo', 'refrescante', 'O', '7_e_Meio', ...] >>> floresta.tagged_words() [('Um', '>N+art'), ('revivalismo', 'H+n'), ...] The tags consist of some syntactic information, followed by a plus sign, followed by a conventional part-of-speech tag. Let's strip off the material before the plus sign: >>> def simplify_tag(t): ... if "+" in t: ... return t[t.index("+")+1:] ... else: ... return t >>> twords = floresta.tagged_words() >>> twords = [(w.lower(), simplify_tag(t)) for (w,t) in twords] >>> twords[:10] [('um', 'art'), ('revivalismo', 'n'), ('refrescante', 'adj'), ('o', 'art'), ('7_e_meio', 'prop'), ('\xe9', 'v-fin'), ('um', 'art'), ('ex-libris', 'n'), ('de', 'prp'), ('a', 'art')] Pretty printing the tagged words: >>> print(' '.join(word + '/' + tag for (word, tag) in twords[:10])) um/art revivalismo/n refrescante/adj o/art 7_e_meio/prop é/v-fin um/art ex-libris/n de/prp a/art Count the word tokens and types, and determine the most common word: >>> words = floresta.words() >>> len(words) 211852 >>> fd = nltk.FreqDist(words) >>> len(fd) 29421 >>> fd.max() 'de' List the 20 most frequent tags, in order of decreasing frequency: >>> tags = [simplify_tag(tag) for (word,tag) in floresta.tagged_words()] >>> fd = nltk.FreqDist(tags) >>> fd.keys()[:20] # doctest: +NORMALIZE_WHITESPACE ['n', 'prp', 'art', 'v-fin', ',', 'prop', 'adj', 'adv', '.', 'conj-c', 'v-inf', 'pron-det', 'v-pcp', 'num', 'pron-indp', 'pron-pers', '\xab', '\xbb', 'conj-s', '}'] We can also access the corpus grouped by sentence: >>> floresta.sents() # doctest: +NORMALIZE_WHITESPACE [['Um', 'revivalismo', 'refrescante'], ['O', '7_e_Meio', '\xe9', 'um', 'ex-libris', 'de', 'a', 'noite', 'algarvia', '.'], ...] 
>>> floresta.tagged_sents() # doctest: +NORMALIZE_WHITESPACE [[('Um', '>N+art'), ('revivalismo', 'H+n'), ('refrescante', 'N<+adj')], [('O', '>N+art'), ('7_e_Meio', 'H+prop'), ('\xe9', 'P+v-fin'), ('um', '>N+art'), ('ex-libris', 'H+n'), ('de', 'H+prp'), ('a', '>N+art'), ('noite', 'H+n'), ('algarvia', 'N<+adj'), ('.', '.')], ...] >>> floresta.parsed_sents() # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS [Tree('UTT+np', [Tree('>N+art', ['Um']), Tree('H+n', ['revivalismo']), Tree('N<+adj', ['refrescante'])]), Tree('STA+fcl', [Tree('SUBJ+np', [Tree('>N+art', ['O']), Tree('H+prop', ['7_e_Meio'])]), Tree('P+v-fin', ['\xe9']), Tree('SC+np', [Tree('>N+art', ['um']), Tree('H+n', ['ex-libris']), Tree('N<+pp', [Tree('H+prp', ['de']), Tree('P<+np', [Tree('>N+art', ['a']), Tree('H+n', ['noite']), Tree('N<+adj', ['algarvia'])])])]), Tree('.', ['.'])]), ...] To view a parse tree, use the ``draw()`` method, e.g.: >>> psents = floresta.parsed_sents() >>> psents[5].draw() # doctest: +SKIP Character Encodings ------------------- Python understands the common character encoding used for Portuguese, ISO 8859-1 (ISO Latin 1). >>> import os, nltk.test >>> testdir = os.path.split(nltk.test.__file__)[0] >>> text = open(os.path.join(testdir, 'floresta.txt'), 'rb').read().decode('ISO 8859-1') >>> text[:60] 'O 7 e Meio \xe9 um ex-libris da noite algarvia.\n\xc9 uma das mais ' >>> print(text[:60]) O 7 e Meio é um ex-libris da noite algarvia. É uma das mais For more information about character encodings and Python, please see section 3.3 of the book. ---------------- Processing Tasks ---------------- Simple Concordancing -------------------- Here's a function that takes a word and a specified amount of context (measured in characters), and generates a concordance for that word. >>> def concordance(word, context=30): ... for sent in floresta.sents(): ... if word in sent: ... pos = sent.index(word) ... left = ' '.join(sent[:pos]) ... right = ' '.join(sent[pos+1:]) ... print('%*s %s %-*s' % ... (context, left[-context:], word, context, right[:context])) >>> concordance("dar") # doctest: +SKIP anduru , foi o suficiente para dar a volta a o resultado . 1. O P?BLICO veio dar a a imprensa di?ria portuguesa A fartura de pensamento pode dar maus resultados e n?s n?o quer Come?a a dar resultados a pol?tica de a Uni ial come?ar a incorporar- lo e dar forma a um ' site ' que tem se r com Constantino para ele lhe dar tamb?m os pap?is assinados . va a brincar , pois n?o lhe ia dar procura??o nenhuma enquanto n? ?rica como o ant?doto capaz de dar sentido a o seu enorme poder . . . . >>> concordance("vender") # doctest: +SKIP er recebido uma encomenda para vender 4000 blindados a o Iraque . m?rico_Amorim caso conseguisse vender o lote de ac??es de o empres?r mpre ter jovens simp?ticos a ? vender ? chega ! } Disse que o governo vai vender ? desde autom?vel at? particip ndiciou ontem duas pessoas por vender carro com ?gio . A inten??o de Fleury ? vender as a??es para equilibrar as fi Part-of-Speech Tagging ---------------------- Let's begin by getting the tagged sentence data, and simplifying the tags as described earlier. 
>>> from nltk.corpus import floresta >>> tsents = floresta.tagged_sents() >>> tsents = [[(w.lower(),simplify_tag(t)) for (w,t) in sent] for sent in tsents if sent] >>> train = tsents[100:] >>> test = tsents[:100] We already know that ``n`` is the most common tag, so we can set up a default tagger that tags every word as a noun, and see how well it does: >>> tagger0 = nltk.DefaultTagger('n') >>> nltk.tag.accuracy(tagger0, test) 0.17697228144989338 Evidently, about one in every six words is a noun. Let's improve on this by training a unigram tagger: >>> tagger1 = nltk.UnigramTagger(train, backoff=tagger0) >>> nltk.tag.accuracy(tagger1, test) 0.87029140014214645 Next a bigram tagger: >>> tagger2 = nltk.BigramTagger(train, backoff=tagger1) >>> nltk.tag.accuracy(tagger2, test) 0.89019189765458417 Sentence Segmentation --------------------- Punkt is a language-neutral sentence segmentation tool. We >>> sent_tokenizer=nltk.data.load('tokenizers/punkt/portuguese.pickle') >>> raw_text = machado.raw('romance/marm05.txt') >>> sentences = sent_tokenizer.tokenize(raw_text) >>> for sent in sentences[1000:1005]: ... print("<<", sent, ">>") ... << Em verdade, parecia ainda mais mulher do que era; seria criança nos seus folgares de moça; mas assim quieta, impassível, tinha a compostura da mulher casada. >> << Talvez essa circunstância lhe diminuía um pouco da graça virginal. >> << Depressa nos familiarizamos; a mãe fazia-lhe grandes elogios, eu escutava-os de boa sombra, e ela sorria com os olhos fúlgidos, como se lá dentro do cérebro lhe estivesse a voar uma borboletinha de asas de ouro e olhos de diamante... >> << Digo lá dentro, porque cá fora o que esvoaçou foi uma borboleta preta, que subitamente penetrou na varanda, e começou a bater as asas em derredor de D. Eusébia. >> << D. Eusébia deu um grito, levantou-se, praguejou umas palavras soltas: - T'esconjuro!... >> The sentence tokenizer can be trained and evaluated on other text. The source text (from the Floresta Portuguese Treebank) contains one sentence per line. We read the text, split it into its lines, and then join these lines together using spaces. Now the information about sentence breaks has been discarded. 
We split this material into training and testing data: >>> import os, nltk.test >>> testdir = os.path.split(nltk.test.__file__)[0] >>> text = open(os.path.join(testdir, 'floresta.txt'), 'rb').read().decode('ISO-8859-1') >>> lines = text.split('\n') >>> train = ' '.join(lines[10:]) >>> test = ' '.join(lines[:10]) Now we train the sentence segmenter (or sentence tokenizer) and use it on our test sentences: >>> stok = nltk.PunktSentenceTokenizer(train) >>> print(stok.tokenize(test)) ['O 7 e Meio \xe9 um ex-libris da noite algarvia.', '\xc9 uma das mais antigas discotecas do Algarve, situada em Albufeira, que continua a manter os tra\xe7os decorativos e as clientelas de sempre.', '\xc9 um pouco a vers\xe3o de uma esp\xe9cie de \xaboutro lado\xbb da noite, a meio caminho entre os devaneios de uma fauna perif\xe9rica, seja de Lisboa, Londres, Dublin ou Faro e Portim\xe3o, e a postura circunspecta dos fi\xe9is da casa, que dela esperam a m\xfasica \xabgeracionista\xbb dos 60 ou dos 70.', 'N\xe3o deixa de ser, nos tempos que correm, um certo \xabvery typical\xbb algarvio, cabe\xe7a de cartaz para os que querem fugir a algumas movimenta\xe7\xf5es nocturnas j\xe1 a caminho da ritualiza\xe7\xe3o de massas, do g\xe9nero \xabvamos todos ao Calypso e encontramo-nos na Locomia\xbb.', 'E assim, aos 2,5 milh\xf5es que o Minist\xe9rio do Planeamento e Administra\xe7\xe3o do Territ\xf3rio j\xe1 gasta no pagamento do pessoal afecto a estes organismos, v\xeam juntar-se os montantes das obras propriamente ditas, que os munic\xedpios, j\xe1 com projectos na m\xe3o, v\xeam reivindicar junto do Executivo, como salienta aquele membro do Governo.', 'E o dinheiro \xabn\xe3o falta s\xf3 \xe0s c\xe2maras\xbb, lembra o secret\xe1rio de Estado, que considera que a solu\xe7\xe3o para as autarquias \xe9 \xabespecializarem-se em fundos comunit\xe1rios\xbb.', 'Mas como, se muitas n\xe3o disp\xf5em, nos seus quadros, dos t\xe9cnicos necess\xe1rios?', '\xabEncomendem-nos a projectistas de fora\xbb porque, se as obras vierem a ser financiadas, eles at\xe9 saem de gra\xe7a, j\xe1 que, nesse caso, \xabos fundos comunit\xe1rios pagam os projectos, o mesmo n\xe3o acontecendo quando eles s\xe3o feitos pelos GAT\xbb, dado serem organismos do Estado.', 'Essa poder\xe1 vir a ser uma hip\xf3tese, at\xe9 porque, no terreno, a capacidade dos GAT est\xe1 cada vez mais enfraquecida.', 'Alguns at\xe9 j\xe1 desapareceram, como o de Castro Verde, e outros t\xeam vindo a perder quadros.'] NLTK's data collection includes a trained model for Portuguese sentence segmentation, which can be loaded as follows. It is faster to load a trained model than to retrain it. >>> stok = nltk.data.load('tokenizers/punkt/portuguese.pickle') Stemming -------- NLTK includes the RSLP Portuguese stemmer. Here we use it to stem some Portuguese text: >>> stemmer = nltk.stem.RSLPStemmer() >>> stemmer.stem("copiar") 'copi' >>> stemmer.stem("paisagem") 'pais' Stopwords --------- NLTK includes Portuguese stopwords: >>> stopwords = nltk.corpus.stopwords.words('portuguese') >>> stopwords[:10] ['a', 'ao', 'aos', 'aquela', 'aquelas', 'aquele', 'aqueles', 'aquilo', 'as', 'at\xe9'] Now we can use these to filter text. Let's find the most frequent words (other than stopwords) and print them in descending order of frequency: >>> fd = nltk.FreqDist(w.lower() for w in floresta.words() if w not in stopwords) >>> for word in list(fd.keys())[:20]: ... print(word, fd[word]) , 13444 . 
7725 « 2369 » 2310 é 1305 o 1086 } 1047 { 1044 a 897 ; 633 em 516 ser 466 sobre 349 os 313 anos 301 ontem 292 ainda 279 segundo 256 ter 249 dois 231 nltk-3.1/nltk/test/portuguese_en_fixt.py0000644000076500000240000000067212607224144020302 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import from nltk.compat import PY3 from nltk.corpus import teardown_module def setup_module(module): from nose import SkipTest raise SkipTest("portuguese_en.doctest imports nltk.examples.pt which doesn't exist!") if not PY3: raise SkipTest( "portuguese_en.doctest was skipped because non-ascii doctests are not supported under Python 2.x" ) nltk-3.1/nltk/test/probability.doctest0000644000076500000240000001672712607224144017731 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT =========== Probability =========== >>> import nltk >>> from nltk.probability import * FreqDist -------- >>> text1 = ['no', 'good', 'fish', 'goes', 'anywhere', 'without', 'a', 'porpoise', '!'] >>> text2 = ['no', 'good', 'porpoise', 'likes', 'to', 'fish', 'fish', 'anywhere', '.'] >>> fd1 = nltk.FreqDist(text1) >>> fd1 == nltk.FreqDist(text1) True Note that items are sorted in order of decreasing frequency; two items of the same frequency appear in indeterminate order. >>> import itertools >>> both = nltk.FreqDist(text1 + text2) >>> both_most_common = both.most_common() >>> list(itertools.chain(*(sorted(ys) for k, ys in itertools.groupby(both_most_common, key=lambda t: t[1])))) [('fish', 3), ('anywhere', 2), ('good', 2), ('no', 2), ('porpoise', 2), ('!', 1), ('.', 1), ('a', 1), ('goes', 1), ('likes', 1), ('to', 1), ('without', 1)] >>> both == fd1 + nltk.FreqDist(text2) True >>> fd1 == nltk.FreqDist(text1) # But fd1 is unchanged True >>> fd2 = nltk.FreqDist(text2) >>> fd1.update(fd2) >>> fd1 == both True >>> fd1 = nltk.FreqDist(text1) >>> fd1.update(text2) >>> fd1 == both True >>> fd1 = nltk.FreqDist(text1) >>> fd2 = nltk.FreqDist(fd1) >>> fd2 == fd1 True ``nltk.FreqDist`` can be pickled: >>> import pickle >>> fd1 = nltk.FreqDist(text1) >>> pickled = pickle.dumps(fd1) >>> fd1 == pickle.loads(pickled) True Testing some HMM estimators --------------------------- We extract a small part (500 sentences) of the Brown corpus >>> corpus = nltk.corpus.brown.tagged_sents(categories='adventure')[:500] >>> print(len(corpus)) 500 We create a HMM trainer - note that we need the tags and symbols from the whole corpus, not just the training corpus >>> from nltk.util import unique_list >>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent) >>> print(len(tag_set)) 92 >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent) >>> print(len(symbols)) 1464 >>> print(len(tag_set)) 92 >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent) >>> print(len(symbols)) 1464 >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols) We divide the corpus into 90% training and 10% testing >>> train_corpus = [] >>> test_corpus = [] >>> for i in range(len(corpus)): ... if i % 10: ... train_corpus += [corpus[i]] ... else: ... test_corpus += [corpus[i]] >>> print(len(train_corpus)) 450 >>> print(len(test_corpus)) 50 And now we can test the estimators >>> def train_and_test(est): ... hmm = trainer.train_supervised(train_corpus, estimator=est) ... 
print('%.2f%%' % (100 * hmm.evaluate(test_corpus))) Maximum Likelihood Estimation ----------------------------- - this resulted in an initialization error before r7209 >>> mle = lambda fd, bins: MLEProbDist(fd) >>> train_and_test(mle) 22.75% Laplace (= Lidstone with gamma==1) >>> train_and_test(LaplaceProbDist) 66.04% Expected Likelihood Estimation (= Lidstone with gamma==0.5) >>> train_and_test(ELEProbDist) 73.01% Lidstone Estimation, for gamma==0.1, 0.5 and 1 (the later two should be exactly equal to MLE and ELE above) >>> def lidstone(gamma): ... return lambda fd, bins: LidstoneProbDist(fd, gamma, bins) >>> train_and_test(lidstone(0.1)) 82.51% >>> train_and_test(lidstone(0.5)) 73.01% >>> train_and_test(lidstone(1.0)) 66.04% Witten Bell Estimation ---------------------- - This resulted in ZeroDivisionError before r7209 >>> train_and_test(WittenBellProbDist) 88.12% Good Turing Estimation >>> gt = lambda fd, bins: SimpleGoodTuringProbDist(fd, bins=1e5) >>> train_and_test(gt) 86.93% Kneser Ney Estimation --------------------- Since the Kneser-Ney distribution is best suited for trigrams, we must adjust our testing accordingly. >>> corpus = [[((x[0],y[0],z[0]),(x[1],y[1],z[1])) ... for x, y, z in nltk.trigrams(sent)] ... for sent in corpus[:100]] We will then need to redefine the rest of the training/testing variables >>> tag_set = unique_list(tag for sent in corpus for (word,tag) in sent) >>> len(tag_set) 906 >>> symbols = unique_list(word for sent in corpus for (word,tag) in sent) >>> len(symbols) 1341 >>> trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols) >>> train_corpus = [] >>> test_corpus = [] >>> for i in range(len(corpus)): ... if i % 10: ... train_corpus += [corpus[i]] ... else: ... test_corpus += [corpus[i]] >>> len(train_corpus) 90 >>> len(test_corpus) 10 >>> kn = lambda fd, bins: KneserNeyProbDist(fd) >>> train_and_test(kn) 0.86% Remains to be added: - Tests for HeldoutProbDist, CrossValidationProbDist and MutableProbDist Squashed bugs ------------- Issue 511: override pop and popitem to invalidate the cache >>> fd = nltk.FreqDist('a') >>> list(fd.keys()) ['a'] >>> fd.pop('a') 1 >>> list(fd.keys()) [] Issue 533: access cumulative frequencies with no arguments >>> fd = nltk.FreqDist('aab') >>> list(fd._cumulative_frequencies(['a'])) [2.0] >>> list(fd._cumulative_frequencies(['a', 'b'])) [2.0, 3.0] Issue 579: override clear to reset some variables >>> fd = FreqDist('aab') >>> fd.clear() >>> fd.N() 0 Issue 351: fix fileids method of CategorizedCorpusReader to inadvertently add errant categories >>> from nltk.corpus import brown >>> brown.fileids('blah') Traceback (most recent call last): ... ValueError: Category blah not found >>> brown.categories() ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] Issue 175: add the unseen bin to SimpleGoodTuringProbDist by default otherwise any unseen events get a probability of zero, i.e., they don't get smoothed >>> from nltk import SimpleGoodTuringProbDist, FreqDist >>> fd = FreqDist({'a':1, 'b':1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4, 'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7, 'o': 8, 'p': 9, 'q': 10}) >>> p = SimpleGoodTuringProbDist(fd) >>> p.prob('a') 0.017649766667026317... >>> p.prob('o') 0.08433050215340411... >>> p.prob('z') 0.022727272727272728... >>> p.prob('foobar') 0.022727272727272728... 
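The same behaviour can be checked interactively. The following snippet is a plain-Python sketch (shown as a literal block rather than a doctest, so no exact output is asserted, and the variable names are illustrative only): it sums the probability mass that Simple Good-Turing assigns to the seen samples and prints how much is held back for unseen events, which is the point of the default unseen bin discussed above::

    from nltk import FreqDist, SimpleGoodTuringProbDist

    # The same toy frequency distribution as in the example above.
    fd = FreqDist({'a': 1, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4,
                   'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7,
                   'o': 8, 'p': 9, 'q': 10})
    sgt = SimpleGoodTuringProbDist(fd)

    # Probability mass assigned to the observed samples; the remainder is
    # (approximately) what gets reserved for unseen events.
    seen_mass = sum(sgt.prob(sample) for sample in fd)
    print('mass on seen samples:   %.4f' % seen_mass)
    print('mass held for unseen:   %.4f' % (1 - seen_mass))
    print('prob of an unseen item: %.6f' % sgt.prob('zzz'))
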
``MLEProbDist``, ``ConditionalProbDist'', ``DictionaryConditionalProbDist`` and ``ConditionalFreqDist`` can be pickled: >>> import pickle >>> pd = MLEProbDist(fd) >>> sorted(pd.samples()) == sorted(pickle.loads(pickle.dumps(pd)).samples()) True >>> dpd = DictionaryConditionalProbDist({'x': pd}) >>> unpickled = pickle.loads(pickle.dumps(dpd)) >>> dpd['x'].prob('a') 0.011363636... >>> dpd['x'].prob('a') == unpickled['x'].prob('a') True >>> cfd = nltk.probability.ConditionalFreqDist() >>> cfd['foo']['hello'] += 1 >>> cfd['foo']['hello'] += 1 >>> cfd['bar']['hello'] += 1 >>> cfd2 = pickle.loads(pickle.dumps(cfd)) >>> cfd2 == cfd True >>> cpd = ConditionalProbDist(cfd, SimpleGoodTuringProbDist) >>> cpd2 = pickle.loads(pickle.dumps(cpd)) >>> cpd['foo'].prob('hello') == cpd2['foo'].prob('hello') True nltk-3.1/nltk/test/probability_fixt.py0000644000076500000240000000052512574600335017736 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import # probability.doctest uses HMM which requires numpy; # skip probability.doctest if numpy is not available def setup_module(module): from nose import SkipTest try: import numpy except ImportError: raise SkipTest("probability.doctest requires numpy")nltk-3.1/nltk/test/propbank.doctest0000644000076500000240000001503212607224144017211 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ======== PropBank ======== The PropBank Corpus provides predicate-argument annotation for the entire Penn Treebank. Each verb in the treebank is annotated by a single instance in PropBank, containing information about the location of the verb, and the location and identity of its arguments: >>> from nltk.corpus import propbank >>> pb_instances = propbank.instances() >>> print(pb_instances) # doctest: +NORMALIZE_WHITESPACE [, , ...] Each propbank instance defines the following member variables: - Location information: `fileid`, `sentnum`, `wordnum` - Annotator information: `tagger` - Inflection information: `inflection` - Roleset identifier: `roleset` - Verb (aka predicate) location: `predicate` - Argument locations and types: `arguments` The following examples show the types of these arguments: >>> inst = pb_instances[103] >>> (inst.fileid, inst.sentnum, inst.wordnum) ('wsj_0004.mrg', 8, 16) >>> inst.tagger 'gold' >>> inst.inflection >>> infl = inst.inflection >>> infl.form, infl.tense, infl.aspect, infl.person, infl.voice ('v', 'p', '-', '-', 'a') >>> inst.roleset 'rise.01' >>> inst.predicate PropbankTreePointer(16, 0) >>> inst.arguments # doctest: +NORMALIZE_WHITESPACE ((PropbankTreePointer(0, 2), 'ARG1'), (PropbankTreePointer(13, 1), 'ARGM-DIS'), (PropbankTreePointer(17, 1), 'ARG4-to'), (PropbankTreePointer(20, 1), 'ARG3-from')) The location of the predicate and of the arguments are encoded using `PropbankTreePointer` objects, as well as `PropbankChainTreePointer` objects and `PropbankSplitTreePointer` objects. A `PropbankTreePointer` consists of a `wordnum` and a `height`: >>> print(inst.predicate.wordnum, inst.predicate.height) 16 0 This identifies the tree constituent that is headed by the word that is the `wordnum`\ 'th token in the sentence, and whose span is found by going `height` nodes up in the tree. This type of pointer is only useful if we also have the corresponding tree structure, since it includes empty elements such as traces in the word number count. 
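As an informal illustration of the wordnum/height arithmetic, the sketch below resolves such a pointer by hand. This is not part of the PropBank API (the `treepos()` and `select()` methods demonstrated below are the supported way to do this), and the helper name is ours::

    def resolve_pointer(tree, wordnum, height):
        """Tree position denoted by a (wordnum, height) pointer.

        Leaves are counted as tokens, traces included, which is why the
        full parse tree is needed to interpret the pointer."""
        path = tree.leaf_treeposition(wordnum)  # path from the root to the leaf
        # Drop the leaf itself, then climb `height` further nodes up.
        return path[:-(height + 1)]

    # For the instance above (wordnum=16, height=0),
    # tree[resolve_pointer(tree, 16, 0)] should be the preterminal
    # (VBD rose), matching what predicate.select(tree) returns below.
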
The trees for 10% of the standard PropBank Corpus are contained in the `treebank` corpus: >>> tree = inst.tree >>> from nltk.corpus import treebank >>> assert tree == treebank.parsed_sents(inst.fileid)[inst.sentnum] >>> inst.predicate.select(tree) Tree('VBD', ['rose']) >>> for (argloc, argid) in inst.arguments: ... print('%-10s %s' % (argid, argloc.select(tree).pformat(500)[:50])) ARG1 (NP-SBJ (NP (DT The) (NN yield)) (PP (IN on) (NP ( ARGM-DIS (PP (IN for) (NP (NN example))) ARG4-to (PP-DIR (TO to) (NP (CD 8.04) (NN %))) ARG3-from (PP-DIR (IN from) (NP (CD 7.90) (NN %))) Propbank tree pointers can be converted to standard tree locations, which are usually easier to work with, using the `treepos()` method: >>> treepos = inst.predicate.treepos(tree) >>> print (treepos, tree[treepos]) (4, 0) (VBD rose) In some cases, argument locations will be encoded using `PropbankChainTreePointer`\ s (for trace chains) or `PropbankSplitTreePointer`\ s (for discontinuous constituents). Both of these objects contain a single member variable, `pieces`, containing a list of the constituent pieces. They also define the method `select()`, which will return a tree containing all the elements of the argument. (A new head node is created, labeled "*CHAIN*" or "*SPLIT*", since the argument is not a single constituent in the original tree). Sentence #6 contains an example of an argument that is both discontinuous and contains a chain: >>> inst = pb_instances[6] >>> inst.roleset 'expose.01' >>> argloc, argid = inst.arguments[2] >>> argloc >>> argloc.pieces [, PropbankTreePointer(27, 0)] >>> argloc.pieces[0].pieces ... # doctest: +NORMALIZE_WHITESPACE [PropbankTreePointer(22, 1), PropbankTreePointer(24, 0), PropbankTreePointer(25, 1)] >>> print(argloc.select(inst.tree)) (*CHAIN* (*SPLIT* (NP (DT a) (NN group)) (IN of) (NP (NNS workers))) (-NONE- *)) The PropBank Corpus also provides access to the frameset files, which define the argument labels used by the annotations, on a per-verb basis. Each frameset file contains one or more predicates, such as 'turn' or 'turn_on', each of which is divided into coarse-grained word senses called rolesets. For each roleset, the frameset file provides descriptions of the argument roles, along with examples. >>> expose_01 = propbank.roleset('expose.01') >>> turn_01 = propbank.roleset('turn.01') >>> print(turn_01) # doctest: +ELLIPSIS >>> for role in turn_01.findall("roles/role"): ... print(role.attrib['n'], role.attrib['descr']) 0 turner 1 thing turning m direction, location >>> from xml.etree import ElementTree >>> print(ElementTree.tostring(turn_01.find('example')).decode('utf8').strip()) John turned the key in the lock. John turned the key in the lock Note that the standard corpus distribution only contains 10% of the treebank, so the parse trees are not available for instances starting at 9353: >>> inst = pb_instances[9352] >>> inst.fileid 'wsj_0199.mrg' >>> print(inst.tree) # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS (S (NP-SBJ (NNP Trinity)) (VP (VBD said) (SBAR (-NONE- 0) ...)) >>> print(inst.predicate.select(inst.tree)) (VB begin) >>> inst = pb_instances[9353] >>> inst.fileid 'wsj_0200.mrg' >>> print(inst.tree) None >>> print(inst.predicate.select(inst.tree)) Traceback (most recent call last): . . . 
ValueError: Parse tree not avaialable However, if you supply your own version of the treebank corpus (by putting it before the nltk-provided version on `nltk.data.path`, or by creating a `ptb` directory as described above and using the `propbank_ptb` module), then you can access the trees for all instances. A list of the verb lemmas contained in PropBank is returned by the `propbank.verbs()` method: >>> propbank.verbs() ['abandon', 'abate', 'abdicate', 'abet', 'abide', ...] nltk-3.1/nltk/test/relextract.doctest0000644000076500000240000002234012607224144017552 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ====================== Information Extraction ====================== Information Extraction standardly consists of three subtasks: #. Named Entity Recognition #. Relation Extraction #. Template Filling Named Entities ~~~~~~~~~~~~~~ The IEER corpus is marked up for a variety of Named Entities. A `Named Entity`:dt: (more strictly, a Named Entity mention) is a name of an entity belonging to a specified class. For example, the Named Entity classes in IEER include PERSON, LOCATION, ORGANIZATION, DATE and so on. Within NLTK, Named Entities are represented as subtrees within a chunk structure: the class name is treated as node label, while the entity mention itself appears as the leaves of the subtree. This is illustrated below, where we have show an extract of the chunk representation of document NYT_19980315.064: >>> from nltk.corpus import ieer >>> docs = ieer.parsed_docs('NYT_19980315') >>> tree = docs[1].text >>> print(tree) # doctest: +ELLIPSIS (DOCUMENT ... ``It's a chance to think about first-level questions,'' said Ms. (PERSON Cohn) , a partner in the (ORGANIZATION McGlashan & Sarrail) firm in (LOCATION San Mateo) , (LOCATION Calif.) ...) Thus, the Named Entity mentions in this example are *Cohn*, *McGlashan & Sarrail*, *San Mateo* and *Calif.*. The CoNLL2002 Dutch and Spanish data is treated similarly, although in this case, the strings are also POS tagged. >>> from nltk.corpus import conll2002 >>> for doc in conll2002.chunked_sents('ned.train')[27]: ... print(doc) (u'Het', u'Art') (ORG Hof/N van/Prep Cassatie/N) (u'verbrak', u'V') (u'het', u'Art') (u'arrest', u'N') (u'zodat', u'Conj') (u'het', u'Pron') (u'moest', u'V') (u'worden', u'V') (u'overgedaan', u'V') (u'door', u'Prep') (u'het', u'Art') (u'hof', u'N') (u'van', u'Prep') (u'beroep', u'N') (u'van', u'Prep') (LOC Antwerpen/N) (u'.', u'Punc') Relation Extraction ~~~~~~~~~~~~~~~~~~~ Relation Extraction standardly consists of identifying specified relations between Named Entities. For example, assuming that we can recognize ORGANIZATIONs and LOCATIONs in text, we might want to also recognize pairs *(o, l)* of these kinds of entities such that *o* is located in *l*. The `sem.relextract` module provides some tools to help carry out a simple version of this task. The `tree2semi_rel()` function splits a chunk document into a list of two-member lists, each of which consists of a (possibly empty) string followed by a `Tree` (i.e., a Named Entity): >>> from nltk.sem import relextract >>> pairs = relextract.tree2semi_rel(tree) >>> for s, tree in pairs[18:22]: ... 
print('("...%s", %s)' % (" ".join(s[-5:]),tree)) ("...about first-level questions,'' said Ms.", (PERSON Cohn)) ("..., a partner in the", (ORGANIZATION McGlashan & Sarrail)) ("...firm in", (LOCATION San Mateo)) ("...,", (LOCATION Calif.)) The function `semi_rel2reldict()` processes triples of these pairs, i.e., pairs of the form ``((string1, Tree1), (string2, Tree2), (string3, Tree3))`` and outputs a dictionary (a `reldict`) in which ``Tree1`` is the subject of the relation, ``string2`` is the filler and ``Tree3`` is the object of the relation. ``string1`` and ``string3`` are stored as left and right context respectively. >>> reldicts = relextract.semi_rel2reldict(pairs) >>> for k, v in sorted(reldicts[0].items()): ... print(k, '=>', v) # doctest: +ELLIPSIS filler => of messages to their own ``Cyberia'' ... lcon => transactions.'' Each week, they post objclass => ORGANIZATION objsym => white_house objtext => White House rcon => for access to its planned subjclass => CARDINAL subjsym => hundreds subjtext => hundreds untagged_filler => of messages to their own ``Cyberia'' ... The next example shows some of the values for two `reldict`\ s corresponding to the ``'NYT_19980315'`` text extract shown earlier. >>> for r in reldicts[18:20]: ... print('=' * 20) ... print(r['subjtext']) ... print(r['filler']) ... print(r['objtext']) ==================== Cohn , a partner in the McGlashan & Sarrail ==================== McGlashan & Sarrail firm in San Mateo The function `relextract()` allows us to filter the `reldict`\ s according to the classes of the subject and object named entities. In addition, we can specify that the filler text has to match a given regular expression, as illustrated in the next example. Here, we are looking for pairs of entities in the IN relation, where IN has signature . >>> import re >>> IN = re.compile(r'.*\bin\b(?!\b.+ing\b)') >>> for fileid in ieer.fileids(): ... for doc in ieer.parsed_docs(fileid): ... for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern = IN): ... print(relextract.rtuple(rel)) # doctest: +ELLIPSIS [ORG: 'Christian Democrats'] ', the leading political forces in' [LOC: 'Italy'] [ORG: 'AP'] ') _ Lebanese guerrillas attacked Israeli forces in southern' [LOC: 'Lebanon'] [ORG: 'Security Council'] 'adopted Resolution 425. Huge yellow banners hung across intersections in' [LOC: 'Beirut'] [ORG: 'U.N.'] 'failures in' [LOC: 'Africa'] [ORG: 'U.N.'] 'peacekeeping operation in' [LOC: 'Somalia'] [ORG: 'U.N.'] 'partners on a more effective role in' [LOC: 'Africa'] [ORG: 'AP'] ') _ A bomb exploded in a mosque in central' [LOC: 'San`a'] [ORG: 'Krasnoye Sormovo'] 'shipyard in the Soviet city of' [LOC: 'Gorky'] [ORG: 'Kelab Golf Darul Ridzuan'] 'in' [LOC: 'Perak'] [ORG: 'U.N.'] 'peacekeeping operation in' [LOC: 'Somalia'] [ORG: 'WHYY'] 'in' [LOC: 'Philadelphia'] [ORG: 'McGlashan & Sarrail'] 'firm in' [LOC: 'San Mateo'] [ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington'] [ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington'] [ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles'] [ORG: 'Open Text'] ', based in' [LOC: 'Waterloo'] ... The next example illustrates a case where the patter is a disjunction of roles that a PERSON can occupy in an ORGANIZATION. >>> roles = """ ... (.*( ... analyst| ... chair(wo)?man| ... commissioner| ... counsel| ... director| ... economist| ... editor| ... executive| ... foreman| ... governor| ... head| ... lawyer| ... leader| ... librarian).*)| ... manager| ... partner| ... 
president| ... producer| ... professor| ... researcher| ... spokes(wo)?man| ... writer| ... ,\sof\sthe?\s* # "X, of (the) Y" ... """ >>> ROLES = re.compile(roles, re.VERBOSE) >>> for fileid in ieer.fileids(): ... for doc in ieer.parsed_docs(fileid): ... for rel in relextract.extract_rels('PER', 'ORG', doc, corpus='ieer', pattern=ROLES): ... print(relextract.rtuple(rel)) # doctest: +ELLIPSIS [PER: 'Kivutha Kibwana'] ', of the' [ORG: 'National Convention Assembly'] [PER: 'Boban Boskovic'] ', chief executive of the' [ORG: 'Plastika'] [PER: 'Annan'] ', the first sub-Saharan African to head the' [ORG: 'United Nations'] [PER: 'Kiriyenko'] 'became a foreman at the' [ORG: 'Krasnoye Sormovo'] [PER: 'Annan'] ', the first sub-Saharan African to head the' [ORG: 'United Nations'] [PER: 'Mike Godwin'] ', chief counsel for the' [ORG: 'Electronic Frontier Foundation'] ... In the case of the CoNLL2002 data, we can include POS tags in the query pattern. This example also illustrates how the output can be presented as something that looks more like a clause in a logical language. >>> de = """ ... .* ... ( ... de/SP| ... del/SP ... ) ... """ >>> DE = re.compile(de, re.VERBOSE) >>> rels = [rel for doc in conll2002.chunked_sents('esp.train') ... for rel in relextract.extract_rels('ORG', 'LOC', doc, corpus='conll2002', pattern = DE)] >>> for r in rels[:10]: ... print(relextract.clause(r, relsym='DE')) # doctest: +NORMALIZE_WHITESPACE DE(u'tribunal_supremo', u'victoria') DE(u'museo_de_arte', u'alcorc\xf3n') DE(u'museo_de_bellas_artes', u'a_coru\xf1a') DE(u'siria', u'l\xedbano') DE(u'uni\xf3n_europea', u'pek\xedn') DE(u'ej\xe9rcito', u'rogberi') DE(u'juzgado_de_instrucci\xf3n_n\xfamero_1', u'san_sebasti\xe1n') DE(u'psoe', u'villanueva_de_la_serena') DE(u'ej\xe9rcito', u'l\xedbano') DE(u'juzgado_de_lo_penal_n\xfamero_2', u'ceuta') >>> vnv = """ ... ( ... is/V| ... was/V| ... werd/V| ... wordt/V ... ) ... .* ... van/Prep ... """ >>> VAN = re.compile(vnv, re.VERBOSE) >>> for doc in conll2002.chunked_sents('ned.train'): ... for r in relextract.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN): ... print(relextract.clause(r, relsym="VAN")) VAN(u"cornet_d'elzius", u'buitenlandse_handel') VAN(u'johan_rottiers', u'kardinaal_van_roey_instituut') VAN(u'annie_lennox', u'eurythmics') nltk-3.1/nltk/test/resolution.doctest0000644000076500000240000001715312607224144017606 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. 
For license information, see LICENSE.TXT ========================= Resolution Theorem Prover ========================= >>> from nltk.inference.resolution import * >>> from nltk.sem import logic >>> from nltk.sem.logic import * >>> logic._counter._value = 0 >>> read_expr = logic.Expression.fromstring >>> P = read_expr('P') >>> Q = read_expr('Q') >>> R = read_expr('R') >>> A = read_expr('A') >>> B = read_expr('B') >>> x = read_expr('x') >>> y = read_expr('y') >>> z = read_expr('z') ------------------------------- Test most_general_unification() ------------------------------- >>> print(most_general_unification(x, x)) {} >>> print(most_general_unification(A, A)) {} >>> print(most_general_unification(A, x)) {x: A} >>> print(most_general_unification(x, A)) {x: A} >>> print(most_general_unification(x, y)) {x: y} >>> print(most_general_unification(P(x), P(A))) {x: A} >>> print(most_general_unification(P(x,B), P(A,y))) {x: A, y: B} >>> print(most_general_unification(P(x,B), P(B,x))) {x: B} >>> print(most_general_unification(P(x,y), P(A,x))) {x: A, y: x} >>> print(most_general_unification(P(Q(x)), P(y))) {y: Q(x)} ------------ Test unify() ------------ >>> print(Clause([]).unify(Clause([]))) [] >>> print(Clause([P(x)]).unify(Clause([-P(A)]))) [{}] >>> print(Clause([P(A), Q(x)]).unify(Clause([-P(x), R(x)]))) [{R(A), Q(A)}] >>> print(Clause([P(A), Q(x), R(x,y)]).unify(Clause([-P(x), Q(y)]))) [{Q(y), Q(A), R(A,y)}] >>> print(Clause([P(A), -Q(y)]).unify(Clause([-P(x), Q(B)]))) [{}] >>> print(Clause([P(x), Q(x)]).unify(Clause([-P(A), -Q(B)]))) [{-Q(B), Q(A)}, {-P(A), P(B)}] >>> print(Clause([P(x,x), Q(x), R(x)]).unify(Clause([-P(A,z), -Q(B)]))) [{-Q(B), Q(A), R(A)}, {-P(A,z), R(B), P(B,B)}] >>> a = clausify(read_expr('P(A)')) >>> b = clausify(read_expr('A=B')) >>> print(a[0].unify(b[0])) [{P(B)}] ------------------------- Test is_tautology() ------------------------- >>> print(Clause([P(A), -P(A)]).is_tautology()) True >>> print(Clause([-P(A), P(A)]).is_tautology()) True >>> print(Clause([P(x), -P(A)]).is_tautology()) False >>> print(Clause([Q(B), -P(A), P(A)]).is_tautology()) True >>> print(Clause([-Q(A), P(R(A)), -P(R(A)), Q(x), -R(y)]).is_tautology()) True >>> print(Clause([P(x), -Q(A)]).is_tautology()) False ------------------------- Test subsumes() ------------------------- >>> print(Clause([P(A), Q(B)]).subsumes(Clause([P(A), Q(B)]))) True >>> print(Clause([-P(A)]).subsumes(Clause([P(A)]))) False >>> print(Clause([P(A), Q(B)]).subsumes(Clause([Q(B), P(A)]))) True >>> print(Clause([P(A), Q(B)]).subsumes(Clause([Q(B), R(A), P(A)]))) True >>> print(Clause([P(A), R(A), Q(B)]).subsumes(Clause([Q(B), P(A)]))) False >>> print(Clause([P(x)]).subsumes(Clause([P(A)]))) True >>> print(Clause([P(A)]).subsumes(Clause([P(x)]))) True ------------ Test prove() ------------ >>> print(ResolutionProverCommand(read_expr('man(x)')).prove()) False >>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('(man(x) -> --man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('-(man(x) & -man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('(man(x) | -man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('-(man(x) & -man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('(man(x) | -man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('(man(x) -> man(x))')).prove()) True >>> 
print(ResolutionProverCommand(read_expr('(man(x) <-> man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('-(man(x) <-> -man(x))')).prove()) True >>> print(ResolutionProverCommand(read_expr('all x.man(x)')).prove()) False >>> print(ResolutionProverCommand(read_expr('-all x.some y.F(x,y) & some x.all y.(-F(x,y))')).prove()) False >>> print(ResolutionProverCommand(read_expr('some x.all y.sees(x,y)')).prove()) False >>> p1 = read_expr('all x.(man(x) -> mortal(x))') >>> p2 = read_expr('man(Socrates)') >>> c = read_expr('mortal(Socrates)') >>> ResolutionProverCommand(c, [p1,p2]).prove() True >>> p1 = read_expr('all x.(man(x) -> walks(x))') >>> p2 = read_expr('man(John)') >>> c = read_expr('some y.walks(y)') >>> ResolutionProverCommand(c, [p1,p2]).prove() True >>> p = read_expr('some e1.some e2.(believe(e1,john,e2) & walk(e2,mary))') >>> c = read_expr('some e0.walk(e0,mary)') >>> ResolutionProverCommand(c, [p]).prove() True ------------ Test proof() ------------ >>> p1 = read_expr('all x.(man(x) -> mortal(x))') >>> p2 = read_expr('man(Socrates)') >>> c = read_expr('mortal(Socrates)') >>> logic._counter._value = 0 >>> tp = ResolutionProverCommand(c, [p1,p2]) >>> tp.prove() True >>> print(tp.proof()) [1] {-mortal(Socrates)} A [2] {-man(z2), mortal(z2)} A [3] {man(Socrates)} A [4] {-man(Socrates)} (1, 2) [5] {mortal(Socrates)} (2, 3) [6] {} (1, 5) ------------------ Question Answering ------------------ One answer >>> p1 = read_expr('father_of(art,john)') >>> p2 = read_expr('father_of(bob,kim)') >>> p3 = read_expr('all x.all y.(father_of(x,y) -> parent_of(x,y))') >>> c = read_expr('all x.(parent_of(x,john) -> ANSWER(x))') >>> logic._counter._value = 0 >>> tp = ResolutionProverCommand(None, [p1,p2,p3,c]) >>> sorted(tp.find_answers()) [] >>> print(tp.proof()) # doctest: +SKIP [1] {father_of(art,john)} A [2] {father_of(bob,kim)} A [3] {-father_of(z3,z4), parent_of(z3,z4)} A [4] {-parent_of(z6,john), ANSWER(z6)} A [5] {parent_of(art,john)} (1, 3) [6] {parent_of(bob,kim)} (2, 3) [7] {ANSWER(z6), -father_of(z6,john)} (3, 4) [8] {ANSWER(art)} (1, 7) [9] {ANSWER(art)} (4, 5) Multiple answers >>> p1 = read_expr('father_of(art,john)') >>> p2 = read_expr('mother_of(ann,john)') >>> p3 = read_expr('all x.all y.(father_of(x,y) -> parent_of(x,y))') >>> p4 = read_expr('all x.all y.(mother_of(x,y) -> parent_of(x,y))') >>> c = read_expr('all x.(parent_of(x,john) -> ANSWER(x))') >>> logic._counter._value = 0 >>> tp = ResolutionProverCommand(None, [p1,p2,p3,p4,c]) >>> sorted(tp.find_answers()) [, ] >>> print(tp.proof()) # doctest: +SKIP [ 1] {father_of(art,john)} A [ 2] {mother_of(ann,john)} A [ 3] {-father_of(z3,z4), parent_of(z3,z4)} A [ 4] {-mother_of(z7,z8), parent_of(z7,z8)} A [ 5] {-parent_of(z10,john), ANSWER(z10)} A [ 6] {parent_of(art,john)} (1, 3) [ 7] {parent_of(ann,john)} (2, 4) [ 8] {ANSWER(z10), -father_of(z10,john)} (3, 5) [ 9] {ANSWER(art)} (1, 8) [10] {ANSWER(z10), -mother_of(z10,john)} (4, 5) [11] {ANSWER(ann)} (2, 10) [12] {ANSWER(art)} (5, 6) [13] {ANSWER(ann)} (5, 7) nltk-3.1/nltk/test/runtests.py0000755000076500000240000000402612574600335016256 0ustar sbstaff00000000000000#!/usr/bin/env python # -*- coding: utf-8 -*- from __future__ import absolute_import, print_function import sys import os import nose from nose.plugins.manager import PluginManager from nose.plugins.doctests import Doctest from nose.plugins import builtin NLTK_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')) sys.path.insert(0, NLTK_ROOT) NLTK_TEST_DIR = os.path.join(NLTK_ROOT, 'nltk') 
if __name__ == '__main__': # there shouldn't be import from NLTK for coverage to work properly from doctest_nose_plugin import DoctestFix class NltkPluginManager(PluginManager): """ Nose plugin manager that replaces standard doctest plugin with a patched version. """ def loadPlugins(self): for plug in builtin.plugins: if plug != Doctest: self.addPlugin(plug()) self.addPlugin(DoctestFix()) super(NltkPluginManager, self).loadPlugins() manager = NltkPluginManager() manager.loadPlugins() # allow passing extra options and running individual tests # Examples: # # python runtests.py semantics.doctest # python runtests.py --with-id -v # python runtests.py --with-id -v nltk.featstruct args = sys.argv[1:] if not args: args = [NLTK_TEST_DIR] if all(arg.startswith('-') for arg in args): # only extra options were passed args += [NLTK_TEST_DIR] arguments = [ '--exclude=', # why is this needed? #'--with-xunit', #'--xunit-file=$WORKSPACE/nosetests.xml', #'--nocapture', '--with-doctest', #'--doctest-tests', #'--debug=nose,nose.importer,nose.inspector,nose.plugins,nose.result,nose.selector', '--doctest-extension=.doctest', '--doctest-fixtures=_fixt', '--doctest-options=+ELLIPSIS,+NORMALIZE_WHITESPACE,+IGNORE_EXCEPTION_DETAIL,+ALLOW_UNICODE,doctestencoding=utf-8', #'--verbosity=3', ] + args nose.main(argv=arguments, plugins=manager.plugins) nltk-3.1/nltk/test/segmentation_fixt.py0000644000076500000240000000044212574600335020111 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import # skip segmentation.doctest if numpy is not available def setup_module(module): from nose import SkipTest try: import numpy except ImportError: raise SkipTest("segmentation.doctest requires numpy")nltk-3.1/nltk/test/semantics.doctest0000644000076500000240000005762712607224144017403 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ========= Semantics ========= >>> import nltk >>> from nltk.sem import Valuation, Model >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'), ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ... ('dog', set(['d1'])), ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] >>> val = Valuation(v) >>> dom = val.domain >>> m = Model(dom, val) Evaluation ---------- The top-level method of a ``Model`` instance is ``evaluate()``, which assigns a semantic value to expressions of the ``logic`` module, under an assignment ``g``: >>> dom = val.domain >>> g = nltk.sem.Assignment(dom) >>> m.evaluate('all x.(boy(x) -> - girl(x))', g) True ``evaluate()`` calls a recursive function ``satisfy()``, which in turn calls a function ``i()`` to interpret non-logical constants and individual variables. ``i()`` delegates the interpretation of these to the the model's ``Valuation`` and the variable assignment ``g`` respectively. Any atomic expression which cannot be assigned a value by ``i`` raises an ``Undefined`` exception; this is caught by ``evaluate``, which returns the string ``'Undefined'``. >>> m.evaluate('walk(adam)', g, trace=2) 'walk(adam)' is undefined under M, g 'Undefined' Batch Processing ---------------- The utility functions ``interpret_sents()`` and ``evaluate_sents()`` are intended to help with processing multiple sentences. Here's an example of the first of these: >>> sents = ['Mary walks'] >>> results = nltk.sem.util.interpret_sents(sents, 'grammars/sample_grammars/sem2.fcfg') >>> for result in results: ... for (synrep, semrep) in result: ... 
print(synrep) (S[SEM=] (NP[-LOC, NUM='sg', SEM=<\P.P(mary)>] (PropN[-LOC, NUM='sg', SEM=<\P.P(mary)>] Mary)) (VP[NUM='sg', SEM=<\x.walk(x)>] (IV[NUM='sg', SEM=<\x.walk(x)>, TNS='pres'] walks))) In order to provide backwards compatibility with 'legacy' grammars where the semantics value is specified with a lowercase ``sem`` feature, the relevant feature name can be passed to the function using the ``semkey`` parameter, as shown here: >>> sents = ['raining'] >>> g = nltk.grammar.FeatureGrammar.fromstring(""" ... % start S ... S[sem=] -> 'raining' ... """) >>> results = nltk.sem.util.interpret_sents(sents, g, semkey='sem') >>> for result in results: ... for (synrep, semrep) in result: ... print(semrep) raining The function ``evaluate_sents()`` works in a similar manner, but also needs to be passed a ``Model`` against which the semantic representations are evaluated. Unit Tests ========== Unit tests for relations and valuations --------------------------------------- >>> from nltk.sem import * Relations are sets of tuples, all of the same length. >>> s1 = set([('d1', 'd2'), ('d1', 'd1'), ('d2', 'd1')]) >>> is_rel(s1) True >>> s2 = set([('d1', 'd2'), ('d1', 'd2'), ('d1',)]) >>> is_rel(s2) Traceback (most recent call last): . . . ValueError: Set set([('d1', 'd2'), ('d1',)]) contains sequences of different lengths >>> s3 = set(['d1', 'd2']) >>> is_rel(s3) Traceback (most recent call last): . . . ValueError: Set set(['d2', 'd1']) contains sequences of different lengths >>> s4 = set2rel(s3) >>> is_rel(s4) True >>> is_rel(set()) True >>> null_binary_rel = set([(None, None)]) >>> is_rel(null_binary_rel) True Sets of entities are converted into sets of singleton tuples (containing strings). >>> sorted(set2rel(s3)) [('d1',), ('d2',)] >>> sorted(set2rel(set([1,3,5,]))) ['1', '3', '5'] >>> set2rel(set()) == set() True >>> set2rel(set2rel(s3)) == set2rel(s3) True Predication is evaluated by set membership. >>> ('d1', 'd2') in s1 True >>> ('d2', 'd2') in s1 False >>> ('d1',) in s1 False >>> 'd2' in s1 False >>> ('d1',) in s4 True >>> ('d1',) in set() False >>> 'd1' in null_binary_rel False >>> val = Valuation([('Fido', 'd1'), ('dog', set(['d1', 'd2'])), ('walk', set())]) >>> sorted(val['dog']) [('d1',), ('d2',)] >>> val.domain == set(['d1', 'd2']) True >>> print(val.symbols) ['Fido', 'dog', 'walk'] Parse a valuation from a string. >>> v = """ ... john => b1 ... mary => g1 ... suzie => g2 ... fido => d1 ... tess => d2 ... noosa => n ... girl => {g1, g2} ... boy => {b1, b2} ... dog => {d1, d2} ... bark => {d1, d2} ... walk => {b1, g2, d1} ... chase => {(b1, g1), (b2, g1), (g1, d1), (g2, d2)} ... see => {(b1, g1), (b2, d2), (g1, b1),(d2, b1), (g2, n)} ... in => {(b1, n), (b2, n), (d2, n)} ... with => {(b1, g1), (g1, b1), (d1, b1), (b1, d1)} ... 
""" >>> val = Valuation.fromstring(v) >>> print(val) # doctest: +SKIP {'bark': set([('d1',), ('d2',)]), 'boy': set([('b1',), ('b2',)]), 'chase': set([('b1', 'g1'), ('g2', 'd2'), ('g1', 'd1'), ('b2', 'g1')]), 'dog': set([('d1',), ('d2',)]), 'fido': 'd1', 'girl': set([('g2',), ('g1',)]), 'in': set([('d2', 'n'), ('b1', 'n'), ('b2', 'n')]), 'john': 'b1', 'mary': 'g1', 'noosa': 'n', 'see': set([('b1', 'g1'), ('b2', 'd2'), ('d2', 'b1'), ('g2', 'n'), ('g1', 'b1')]), 'suzie': 'g2', 'tess': 'd2', 'walk': set([('d1',), ('b1',), ('g2',)]), 'with': set([('b1', 'g1'), ('d1', 'b1'), ('b1', 'd1'), ('g1', 'b1')])} Unit tests for function argument application in a Model ------------------------------------------------------- >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),\ ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1'])), ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')])), ... ('kiss', null_binary_rel)] >>> val = Valuation(v) >>> dom = val.domain >>> m = Model(dom, val) >>> g = Assignment(dom) >>> sorted(val['boy']) [('b1',), ('b2',)] >>> ('b1',) in val['boy'] True >>> ('g1',) in val['boy'] False >>> ('foo',) in val['boy'] False >>> ('b1', 'g1') in val['love'] True >>> ('b1', 'b1') in val['kiss'] False >>> sorted(val.domain) ['b1', 'b2', 'd1', 'g1', 'g2'] Model Tests =========== Extension of Lambda expressions >>> v0 = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'),\ ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ... ('dog', set(['d1'])), ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] >>> val0 = Valuation(v0) >>> dom0 = val0.domain >>> m0 = Model(dom0, val0) >>> g0 = Assignment(dom0) >>> print(m0.evaluate(r'\x. \y. love(x, y)', g0) == {'g2': {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}, 'b2': {'g2': True, 'b2': False, 'b1': False, 'g1': False, 'd1': False}, 'b1': {'g2': False, 'b2': False, 'b1': False, 'g1': True, 'd1': False}, 'g1': {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}, 'd1': {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False}}) True >>> print(m0.evaluate(r'\x. dog(x) (adam)', g0)) False >>> print(m0.evaluate(r'\x. (dog(x) | boy(x)) (adam)', g0)) True >>> print(m0.evaluate(r'\x. \y. love(x, y)(fido)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False}) True >>> print(m0.evaluate(r'\x. \y. love(x, y)(adam)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': True, 'd1': False}) True >>> print(m0.evaluate(r'\x. \y. love(x, y)(betty)', g0) == {'g2': False, 'b2': False, 'b1': True, 'g1': False, 'd1': False}) True >>> print(m0.evaluate(r'\x. \y. love(x, y)(betty)(adam)', g0)) True >>> print(m0.evaluate(r'\x. \y. love(x, y)(betty, adam)', g0)) True >>> print(m0.evaluate(r'\y. \x. love(x, y)(fido)(adam)', g0)) False >>> print(m0.evaluate(r'\y. \x. love(x, y)(betty, adam)', g0)) True >>> print(m0.evaluate(r'\x. exists y. love(x, y)', g0) == {'g2': True, 'b2': True, 'b1': True, 'g1': True, 'd1': False}) True >>> print(m0.evaluate(r'\z. adam', g0) == {'g2': 'b1', 'b2': 'b1', 'b1': 'b1', 'g1': 'b1', 'd1': 'b1'}) True >>> print(m0.evaluate(r'\z. love(x, y)', g0) == {'g2': False, 'b2': False, 'b1': False, 'g1': False, 'd1': False}) True Propositional Model Test ------------------------ >>> tests = [ ... ('P & Q', True), ... ('P & R', False), ... ('- P', False), ... ('- R', True), ... ('- - P', True), ... ('- (P & R)', True), ... ('P | R', True), ... ('R | P', True), ... ('R | R', False), ... ('- P | R', False), ... 
('P | - P', True), ... ('P -> Q', True), ... ('P -> R', False), ... ('R -> P', True), ... ('P <-> P', True), ... ('R <-> R', True), ... ('P <-> R', False), ... ] >>> val1 = Valuation([('P', True), ('Q', True), ('R', False)]) >>> dom = set([]) >>> m = Model(dom, val1) >>> g = Assignment(dom) >>> for (sent, testvalue) in tests: ... semvalue = m.evaluate(sent, g) ... if semvalue == testvalue: ... print('*', end=' ') * * * * * * * * * * * * * * * * * Test of i Function ------------------ >>> from nltk.sem import Expression >>> v = [('adam', 'b1'), ('betty', 'g1'), ('fido', 'd1'), ... ('girl', set(['g1', 'g2'])), ('boy', set(['b1', 'b2'])), ('dog', set(['d1'])), ... ('love', set([('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')]))] >>> val = Valuation(v) >>> dom = val.domain >>> m = Model(dom, val) >>> g = Assignment(dom, [('x', 'b1'), ('y', 'g2')]) >>> exprs = ['adam', 'girl', 'love', 'walks', 'x', 'y', 'z'] >>> parsed_exprs = [Expression.fromstring(e) for e in exprs] >>> sorted_set = lambda x: sorted(x) if isinstance(x, set) else x >>> for parsed in parsed_exprs: ... try: ... print("'%s' gets value %s" % (parsed, sorted_set(m.i(parsed, g)))) ... except Undefined: ... print("'%s' is Undefined" % parsed) 'adam' gets value b1 'girl' gets value [('g1',), ('g2',)] 'love' gets value [('b1', 'g1'), ('b2', 'g2'), ('g1', 'b1'), ('g2', 'b1')] 'walks' is Undefined 'x' gets value b1 'y' gets value g2 'z' is Undefined Test for formulas in Model -------------------------- >>> tests = [ ... ('love(adam, betty)', True), ... ('love(adam, sue)', 'Undefined'), ... ('dog(fido)', True), ... ('- dog(fido)', False), ... ('- - dog(fido)', True), ... ('- dog(sue)', 'Undefined'), ... ('dog(fido) & boy(adam)', True), ... ('- (dog(fido) & boy(adam))', False), ... ('- dog(fido) & boy(adam)', False), ... ('dog(fido) | boy(adam)', True), ... ('- (dog(fido) | boy(adam))', False), ... ('- dog(fido) | boy(adam)', True), ... ('- dog(fido) | - boy(adam)', False), ... ('dog(fido) -> boy(adam)', True), ... ('- (dog(fido) -> boy(adam))', False), ... ('- dog(fido) -> boy(adam)', True), ... ('exists x . love(adam, x)', True), ... ('all x . love(adam, x)', False), ... ('fido = fido', True), ... ('exists x . all y. love(x, y)', False), ... ('exists x . (x = fido)', True), ... ('all x . (dog(x) | - dog(x))', True), ... ('adam = mia', 'Undefined'), ... ('\\x. (boy(x) | girl(x))', {'g2': True, 'b2': True, 'b1': True, 'g1': True, 'd1': False}), ... ('\\x. exists y. (boy(x) & love(x, y))', {'g2': False, 'b2': True, 'b1': True, 'g1': False, 'd1': False}), ... ('exists z1. boy(z1)', True), ... ('exists x. (boy(x) & - (x = adam))', True), ... ('exists x. (boy(x) & all y. love(y, x))', False), ... ('all x. (boy(x) | girl(x))', False), ... ('all x. (girl(x) -> exists y. boy(y) & love(x, y))', False), ... ('exists x. (boy(x) & all y. (girl(y) -> love(y, x)))', True), ... ('exists x. (boy(x) & all y. (girl(y) -> love(x, y)))', False), ... ('all x. (dog(x) -> - girl(x))', True), ... ('exists x. exists y. (love(x, y) & love(x, y))', True), ... ] >>> for (sent, testvalue) in tests: ... semvalue = m.evaluate(sent, g) ... if semvalue == testvalue: ... print('*', end=' ') ... else: ... print(sent, semvalue) * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * Satisfier Tests --------------- >>> formulas = [ ... 'boy(x)', ... '(x = x)', ... '(boy(x) | girl(x))', ... '(boy(x) & girl(x))', ... 'love(adam, x)', ... 'love(x, adam)', ... '- (x = adam)', ... 'exists z22. love(x, z22)', ... 'exists y. love(y, x)', ... 'all y. 
(girl(y) -> love(x, y))', ... 'all y. (girl(y) -> love(y, x))', ... 'all y. (girl(y) -> (boy(x) & love(y, x)))', ... 'boy(x) & all y. (girl(y) -> love(x, y))', ... 'boy(x) & all y. (girl(y) -> love(y, x))', ... 'boy(x) & exists y. (girl(y) & love(y, x))', ... 'girl(x) -> dog(x)', ... 'all y. (dog(y) -> (x = y))', ... '- exists y. love(y, x)', ... 'exists y. (love(adam, y) & love(y, x))' ... ] >>> g.purge() >>> g.add('x', 'b1') {'x': 'b1'} >>> for f in formulas: # doctest: +NORMALIZE_WHITESPACE ... try: ... print("'%s' gets value: %s" % (f, m.evaluate(f, g))) ... except Undefined: ... print("'%s' is Undefined" % f) 'boy(x)' gets value: True '(x = x)' gets value: True '(boy(x) | girl(x))' gets value: True '(boy(x) & girl(x))' gets value: False 'love(adam, x)' gets value: False 'love(x, adam)' gets value: False '- (x = adam)' gets value: False 'exists z22. love(x, z22)' gets value: True 'exists y. love(y, x)' gets value: True 'all y. (girl(y) -> love(x, y))' gets value: False 'all y. (girl(y) -> love(y, x))' gets value: True 'all y. (girl(y) -> (boy(x) & love(y, x)))' gets value: True 'boy(x) & all y. (girl(y) -> love(x, y))' gets value: False 'boy(x) & all y. (girl(y) -> love(y, x))' gets value: True 'boy(x) & exists y. (girl(y) & love(y, x))' gets value: True 'girl(x) -> dog(x)' gets value: True 'all y. (dog(y) -> (x = y))' gets value: False '- exists y. love(y, x)' gets value: False 'exists y. (love(adam, y) & love(y, x))' gets value: True >>> from nltk.sem import Expression >>> for fmla in formulas: # doctest: +NORMALIZE_WHITESPACE ... p = Expression.fromstring(fmla) ... g.purge() ... print("Satisfiers of '%s':\n\t%s" % (p, sorted(m.satisfiers(p, 'x', g)))) Satisfiers of 'boy(x)': ['b1', 'b2'] Satisfiers of '(x = x)': ['b1', 'b2', 'd1', 'g1', 'g2'] Satisfiers of '(boy(x) | girl(x))': ['b1', 'b2', 'g1', 'g2'] Satisfiers of '(boy(x) & girl(x))': [] Satisfiers of 'love(adam,x)': ['g1'] Satisfiers of 'love(x,adam)': ['g1', 'g2'] Satisfiers of '-(x = adam)': ['b2', 'd1', 'g1', 'g2'] Satisfiers of 'exists z22.love(x,z22)': ['b1', 'b2', 'g1', 'g2'] Satisfiers of 'exists y.love(y,x)': ['b1', 'g1', 'g2'] Satisfiers of 'all y.(girl(y) -> love(x,y))': [] Satisfiers of 'all y.(girl(y) -> love(y,x))': ['b1'] Satisfiers of 'all y.(girl(y) -> (boy(x) & love(y,x)))': ['b1'] Satisfiers of '(boy(x) & all y.(girl(y) -> love(x,y)))': [] Satisfiers of '(boy(x) & all y.(girl(y) -> love(y,x)))': ['b1'] Satisfiers of '(boy(x) & exists y.(girl(y) & love(y,x)))': ['b1'] Satisfiers of '(girl(x) -> dog(x))': ['b1', 'b2', 'd1'] Satisfiers of 'all y.(dog(y) -> (x = y))': ['d1'] Satisfiers of '-exists y.love(y,x)': ['b2', 'd1'] Satisfiers of 'exists y.(love(adam,y) & love(y,x))': ['b1'] Tests based on the Blackburn & Bos testsuite -------------------------------------------- >>> v1 = [('jules', 'd1'), ('vincent', 'd2'), ('pumpkin', 'd3'), ... ('honey_bunny', 'd4'), ('yolanda', 'd5'), ... ('customer', set(['d1', 'd2'])), ... ('robber', set(['d3', 'd4'])), ... ('love', set([('d3', 'd4')]))] >>> val1 = Valuation(v1) >>> dom1 = val1.domain >>> m1 = Model(dom1, val1) >>> g1 = Assignment(dom1) >>> v2 = [('jules', 'd1'), ('vincent', 'd2'), ('pumpkin', 'd3'), ... ('honey_bunny', 'd4'), ('yolanda', 'd4'), ... ('customer', set(['d1', 'd2', 'd5', 'd6'])), ... ('robber', set(['d3', 'd4'])), ... 
('love', set([(None, None)]))] >>> val2 = Valuation(v2) >>> dom2 = set(['d1', 'd2', 'd3', 'd4', 'd5', 'd6']) >>> m2 = Model(dom2, val2) >>> g2 = Assignment(dom2) >>> g21 = Assignment(dom2) >>> g21.add('y', 'd3') {'y': 'd3'} >>> v3 = [('mia', 'd1'), ('jody', 'd2'), ('jules', 'd3'), ... ('vincent', 'd4'), ... ('woman', set(['d1', 'd2'])), ('man', set(['d3', 'd4'])), ... ('joke', set(['d5', 'd6'])), ('episode', set(['d7', 'd8'])), ... ('in', set([('d5', 'd7'), ('d5', 'd8')])), ... ('tell', set([('d1', 'd5'), ('d2', 'd6')]))] >>> val3 = Valuation(v3) >>> dom3 = set(['d1', 'd2', 'd3', 'd4', 'd5', 'd6', 'd7', 'd8']) >>> m3 = Model(dom3, val3) >>> g3 = Assignment(dom3) >>> tests = [ ... ('exists x. robber(x)', m1, g1, True), ... ('exists x. exists y. love(y, x)', m1, g1, True), ... ('exists x0. exists x1. love(x1, x0)', m2, g2, False), ... ('all x. all y. love(y, x)', m2, g2, False), ... ('- (all x. all y. love(y, x))', m2, g2, True), ... ('all x. all y. - love(y, x)', m2, g2, True), ... ('yolanda = honey_bunny', m2, g2, True), ... ('mia = honey_bunny', m2, g2, 'Undefined'), ... ('- (yolanda = honey_bunny)', m2, g2, False), ... ('- (mia = honey_bunny)', m2, g2, 'Undefined'), ... ('all x. (robber(x) | customer(x))', m2, g2, True), ... ('- (all x. (robber(x) | customer(x)))', m2, g2, False), ... ('(robber(x) | customer(x))', m2, g2, 'Undefined'), ... ('(robber(y) | customer(y))', m2, g21, True), ... ('exists x. (man(x) & exists x. woman(x))', m3, g3, True), ... ('exists x. (man(x) & exists x. woman(x))', m3, g3, True), ... ('- exists x. woman(x)', m3, g3, False), ... ('exists x. (tasty(x) & burger(x))', m3, g3, 'Undefined'), ... ('- exists x. (tasty(x) & burger(x))', m3, g3, 'Undefined'), ... ('exists x. (man(x) & - exists y. woman(y))', m3, g3, False), ... ('exists x. (man(x) & - exists x. woman(x))', m3, g3, False), ... ('exists x. (woman(x) & - exists x. customer(x))', m2, g2, 'Undefined'), ... ] >>> for item in tests: ... sentence, model, g, testvalue = item ... semvalue = model.evaluate(sentence, g) ... if semvalue == testvalue: ... print('*', end=' ') ... g.purge() * * * * * * * * * * * * * * * * * * * * * * Tests for mapping from syntax to semantics ------------------------------------------ Load a valuation from a file. >>> import nltk.data >>> from nltk.sem.util import parse_sents >>> val = nltk.data.load('grammars/sample_grammars/valuation1.val') >>> dom = val.domain >>> m = Model(dom, val) >>> g = Assignment(dom) >>> gramfile = 'grammars/sample_grammars/sem2.fcfg' >>> inputs = ['John sees a girl', 'every dog barks'] >>> parses = parse_sents(inputs, gramfile) >>> for sent, trees in zip(inputs, parses): ... print() ... print("Sentence: %s" % sent) ... for tree in trees: ... print("Parse:\n %s" %tree) ... 
print("Semantics: %s" % root_semrep(tree)) Sentence: John sees a girl Parse: (S[SEM=] (NP[-LOC, NUM='sg', SEM=<\P.P(john)>] (PropN[-LOC, NUM='sg', SEM=<\P.P(john)>] John)) (VP[NUM='sg', SEM=<\y.exists x.(girl(x) & see(y,x))>] (TV[NUM='sg', SEM=<\X y.X(\x.see(y,x))>, TNS='pres'] sees) (NP[NUM='sg', SEM=<\Q.exists x.(girl(x) & Q(x))>] (Det[NUM='sg', SEM=<\P Q.exists x.(P(x) & Q(x))>] a) (Nom[NUM='sg', SEM=<\x.girl(x)>] (N[NUM='sg', SEM=<\x.girl(x)>] girl))))) Semantics: exists x.(girl(x) & see(john,x)) Sentence: every dog barks Parse: (S[SEM= bark(x))>] (NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>] (Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every) (Nom[NUM='sg', SEM=<\x.dog(x)>] (N[NUM='sg', SEM=<\x.dog(x)>] dog))) (VP[NUM='sg', SEM=<\x.bark(x)>] (IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks))) Semantics: all x.(dog(x) -> bark(x)) >>> sent = "every dog barks" >>> result = nltk.sem.util.interpret_sents([sent], gramfile)[0] >>> for (syntree, semrep) in result: ... print(syntree) ... print() ... print(semrep) (S[SEM= bark(x))>] (NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>] (Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every) (Nom[NUM='sg', SEM=<\x.dog(x)>] (N[NUM='sg', SEM=<\x.dog(x)>] dog))) (VP[NUM='sg', SEM=<\x.bark(x)>] (IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks))) all x.(dog(x) -> bark(x)) >>> result = nltk.sem.util.evaluate_sents([sent], gramfile, m, g)[0] >>> for (syntree, semrel, value) in result: ... print(syntree) ... print() ... print(semrep) ... print() ... print(value) (S[SEM= bark(x))>] (NP[NUM='sg', SEM=<\Q.all x.(dog(x) -> Q(x))>] (Det[NUM='sg', SEM=<\P Q.all x.(P(x) -> Q(x))>] every) (Nom[NUM='sg', SEM=<\x.dog(x)>] (N[NUM='sg', SEM=<\x.dog(x)>] dog))) (VP[NUM='sg', SEM=<\x.bark(x)>] (IV[NUM='sg', SEM=<\x.bark(x)>, TNS='pres'] barks))) all x.(dog(x) -> bark(x)) True >>> sents = ['Mary walks', 'John sees a dog'] >>> results = nltk.sem.util.interpret_sents(sents, 'grammars/sample_grammars/sem2.fcfg') >>> for result in results: ... for (synrep, semrep) in result: ... print(synrep) (S[SEM=] (NP[-LOC, NUM='sg', SEM=<\P.P(mary)>] (PropN[-LOC, NUM='sg', SEM=<\P.P(mary)>] Mary)) (VP[NUM='sg', SEM=<\x.walk(x)>] (IV[NUM='sg', SEM=<\x.walk(x)>, TNS='pres'] walks))) (S[SEM=] (NP[-LOC, NUM='sg', SEM=<\P.P(john)>] (PropN[-LOC, NUM='sg', SEM=<\P.P(john)>] John)) (VP[NUM='sg', SEM=<\y.exists x.(dog(x) & see(y,x))>] (TV[NUM='sg', SEM=<\X y.X(\x.see(y,x))>, TNS='pres'] sees) (NP[NUM='sg', SEM=<\Q.exists x.(dog(x) & Q(x))>] (Det[NUM='sg', SEM=<\P Q.exists x.(P(x) & Q(x))>] a) (Nom[NUM='sg', SEM=<\x.dog(x)>] (N[NUM='sg', SEM=<\x.dog(x)>] dog))))) Cooper Storage -------------- >>> from nltk.sem import cooper_storage as cs >>> sentence = 'every girl chases a dog' >>> trees = cs.parse_with_bindops(sentence, grammar='grammars/book_grammars/storage.fcfg') >>> semrep = trees[0].label()['SEM'] >>> cs_semrep = cs.CooperStore(semrep) >>> print(cs_semrep.core) chase(z2,z4) >>> for bo in cs_semrep.store: ... print(bo) bo(\P.all x.(girl(x) -> P(x)),z2) bo(\P.exists x.(dog(x) & P(x)),z4) >>> cs_semrep.s_retrieve(trace=True) Permutation 1 (\P.all x.(girl(x) -> P(x)))(\z2.chase(z2,z4)) (\P.exists x.(dog(x) & P(x)))(\z4.all x.(girl(x) -> chase(x,z4))) Permutation 2 (\P.exists x.(dog(x) & P(x)))(\z4.chase(z2,z4)) (\P.all x.(girl(x) -> P(x)))(\z2.exists x.(dog(x) & chase(z2,x))) >>> for reading in cs_semrep.readings: ... 
print(reading) exists x.(dog(x) & all z3.(girl(z3) -> chase(z3,x))) all x.(girl(x) -> exists z4.(dog(z4) & chase(x,z4))) nltk-3.1/nltk/test/semantics_fixt.py0000644000076500000240000000031212574600335017376 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import # reset the variables counter before running tests def setup_module(module): from nltk.sem import logic logic._counter._value = 0 nltk-3.1/nltk/test/sentiment.doctest0000644000076500000240000002676512607224144017422 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT =================== Sentiment Analysis =================== >>> from nltk.classify import NaiveBayesClassifier >>> from nltk.corpus import subjectivity >>> from nltk.sentiment import SentimentAnalyzer >>> from nltk.sentiment.util import * >>> n_instances = 100 >>> subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]] >>> obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]] >>> len(subj_docs), len(obj_docs) (100, 100) Each document is represented by a tuple (sentence, label). The sentence is tokenized, so it is represented by a list of strings: >>> subj_docs[0] (['smart', 'and', 'alert', ',', 'thirteen', 'conversations', 'about', 'one', 'thing', 'is', 'a', 'small', 'gem', '.'], 'subj') We separately split subjective and objective instances to keep a balanced uniform class distribution in both train and test sets. >>> train_subj_docs = subj_docs[:80] >>> test_subj_docs = subj_docs[80:100] >>> train_obj_docs = obj_docs[:80] >>> test_obj_docs = obj_docs[80:100] >>> training_docs = train_subj_docs+train_obj_docs >>> testing_docs = test_subj_docs+test_obj_docs >>> sentim_analyzer = SentimentAnalyzer() >>> all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs]) We use simple unigram word features, handling negation: >>> unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4) >>> len(unigram_feats) 83 >>> sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats) We apply features to obtain a feature-value representation of our datasets: >>> training_set = sentim_analyzer.apply_features(training_docs) >>> test_set = sentim_analyzer.apply_features(testing_docs) We can now train our classifier on the training set, and subsequently output the evaluation results: >>> trainer = NaiveBayesClassifier.train >>> classifier = sentim_analyzer.train(trainer, training_set) Training classifier >>> for key,value in sorted(sentim_analyzer.evaluate(test_set).items()): ... print('{0}: {1}'.format(key, value)) Evaluating NaiveBayesClassifier results... Accuracy: 0.8 F-measure [obj]: 0.8 F-measure [subj]: 0.8 Precision [obj]: 0.8 Precision [subj]: 0.8 Recall [obj]: 0.8 Recall [subj]: 0.8 Vader ------ >>> from nltk.sentiment.vader import SentimentIntensityAnalyzer >>> sentences = ["VADER is smart, handsome, and funny.", # positive sentence example ... "VADER is smart, handsome, and funny!", # punctuation emphasis handled correctly (sentiment intensity adjusted) ... "VADER is very smart, handsome, and funny.", # booster words handled correctly (sentiment intensity adjusted) ... "VADER is VERY SMART, handsome, and FUNNY.", # emphasis for ALLCAPS handled ... "VADER is VERY SMART, handsome, and FUNNY!!!",# combination of signals - VADER appropriately adjusts intensity ... 
"VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!!",# booster words & punctuation make this close to ceiling for score ... "The book was good.", # positive sentence ... "The book was kind of good.", # qualified positive sentence is handled correctly (intensity adjusted) ... "The plot was good, but the characters are uncompelling and the dialog is not great.", # mixed negation sentence ... "A really bad, horrible book.", # negative sentence with booster words ... "At least it isn't a horrible book.", # negated negative sentence with contraction ... ":) and :D", # emoticons handled ... "", # an empty string is correctly handled ... "Today sux", # negative slang handled ... "Today sux!", # negative slang with punctuation emphasis handled ... "Today SUX!", # negative slang with capitalization emphasis ... "Today kinda sux! But I'll get by, lol" # mixed sentiment example with slang and constrastive conjunction "but" ... ] >>> paragraph = "It was one of the worst movies I've seen, despite good reviews. \ ... Unbelievably bad acting!! Poor direction. VERY poor production. \ ... The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!" >>> from nltk import tokenize >>> lines_list = tokenize.sent_tokenize(paragraph) >>> sentences.extend(lines_list) >>> tricky_sentences = [ ... "Most automated sentiment analysis tools are shit.", ... "VADER sentiment analysis is the shit.", ... "Sentiment analysis has never been good.", ... "Sentiment analysis with VADER has never been this good.", ... "Warren Beatty has never been so entertaining.", ... "I won't say that the movie is astounding and I wouldn't claim that \ ... the movie is too banal either.", ... "I like to hate Michael Bay films, but I couldn't fault this one", ... "It's one thing to watch an Uwe Boll film, but another thing entirely \ ... to pay for it", ... "The movie was too good", ... "This movie was actually neither that funny, nor super witty.", ... "This movie doesn't care about cleverness, wit or any other kind of \ ... intelligent humor.", ... "Those who find ugly meanings in beautiful things are corrupt without \ ... being charming.", ... "There are slow and repetitive parts, BUT it has just enough spice to \ ... keep it interesting.", ... "The script is not fantastic, but the acting is decent and the cinematography \ ... is EXCELLENT!", ... "Roger Dodger is one of the most compelling variations on this theme.", ... "Roger Dodger is one of the least compelling variations on this theme.", ... "Roger Dodger is at least compelling as a variation on the theme.", ... "they fall in love with the product", ... "but then it breaks", ... "usually around the time the 90 day warranty expires", ... "the twin towers collapsed today", ... "However, Mr. Carter solemnly argues, his client carried out the kidnapping \ ... under orders and in the ''least offensive way possible.''" ... ] >>> sentences.extend(tricky_sentences) >>> sid = SentimentIntensityAnalyzer() >>> for sentence in sentences: ... print(sentence) ... ss = sid.polarity_scores(sentence) ... for k in sorted(ss): ... print('{0}: {1}, '.format(k, ss[k]), end='') ... print() VADER is smart, handsome, and funny. compound: 0.8316, neg: 0.0, neu: 0.254, pos: 0.746, VADER is smart, handsome, and funny! compound: 0.8439, neg: 0.0, neu: 0.248, pos: 0.752, VADER is very smart, handsome, and funny. compound: 0.8545, neg: 0.0, neu: 0.299, pos: 0.701, VADER is VERY SMART, handsome, and FUNNY. 
compound: 0.9227, neg: 0.0, neu: 0.246, pos: 0.754, VADER is VERY SMART, handsome, and FUNNY!!! compound: 0.9342, neg: 0.0, neu: 0.233, pos: 0.767, VADER is VERY SMART, really handsome, and INCREDIBLY FUNNY!!! compound: 0.9469, neg: 0.0, neu: 0.294, pos: 0.706, The book was good. compound: 0.4404, neg: 0.0, neu: 0.508, pos: 0.492, The book was kind of good. compound: 0.3832, neg: 0.0, neu: 0.657, pos: 0.343, The plot was good, but the characters are uncompelling and the dialog is not great. compound: -0.7042, neg: 0.327, neu: 0.579, pos: 0.094, A really bad, horrible book. compound: -0.8211, neg: 0.791, neu: 0.209, pos: 0.0, At least it isn't a horrible book. compound: 0.431, neg: 0.0, neu: 0.637, pos: 0.363, :) and :D compound: 0.7925, neg: 0.0, neu: 0.124, pos: 0.876, compound: 0.0, neg: 0.0, neu: 0.0, pos: 0.0, Today sux compound: -0.3612, neg: 0.714, neu: 0.286, pos: 0.0, Today sux! compound: -0.4199, neg: 0.736, neu: 0.264, pos: 0.0, Today SUX! compound: -0.5461, neg: 0.779, neu: 0.221, pos: 0.0, Today kinda sux! But I'll get by, lol compound: 0.2228, neg: 0.195, neu: 0.531, pos: 0.274, It was one of the worst movies I've seen, despite good reviews. compound: -0.7584, neg: 0.394, neu: 0.606, pos: 0.0, Unbelievably bad acting!! compound: -0.6572, neg: 0.686, neu: 0.314, pos: 0.0, Poor direction. compound: -0.4767, neg: 0.756, neu: 0.244, pos: 0.0, VERY poor production. compound: -0.6281, neg: 0.674, neu: 0.326, pos: 0.0, The movie was bad. compound: -0.5423, neg: 0.538, neu: 0.462, pos: 0.0, Very bad movie. compound: -0.5849, neg: 0.655, neu: 0.345, pos: 0.0, VERY bad movie. compound: -0.6732, neg: 0.694, neu: 0.306, pos: 0.0, VERY BAD movie. compound: -0.7398, neg: 0.724, neu: 0.276, pos: 0.0, VERY BAD movie! compound: -0.7616, neg: 0.735, neu: 0.265, pos: 0.0, Most automated sentiment analysis tools are shit. compound: -0.5574, neg: 0.375, neu: 0.625, pos: 0.0, VADER sentiment analysis is the shit. compound: 0.6124, neg: 0.0, neu: 0.556, pos: 0.444, Sentiment analysis has never been good. compound: -0.3412, neg: 0.325, neu: 0.675, pos: 0.0, Sentiment analysis with VADER has never been this good. compound: 0.5228, neg: 0.0, neu: 0.703, pos: 0.297, Warren Beatty has never been so entertaining. compound: 0.5777, neg: 0.0, neu: 0.616, pos: 0.384, I won't say that the movie is astounding and I wouldn't claim that the movie is too banal either. compound: 0.4215, neg: 0.0, neu: 0.851, pos: 0.149, I like to hate Michael Bay films, but I couldn't fault this one compound: 0.3153, neg: 0.157, neu: 0.534, pos: 0.309, It's one thing to watch an Uwe Boll film, but another thing entirely to pay for it compound: -0.2541, neg: 0.112, neu: 0.888, pos: 0.0, The movie was too good compound: 0.4404, neg: 0.0, neu: 0.58, pos: 0.42, This movie was actually neither that funny, nor super witty. compound: -0.6759, neg: 0.41, neu: 0.59, pos: 0.0, This movie doesn't care about cleverness, wit or any other kind of intelligent humor. compound: -0.1338, neg: 0.265, neu: 0.497, pos: 0.239, Those who find ugly meanings in beautiful things are corrupt without being charming. compound: -0.3553, neg: 0.314, neu: 0.493, pos: 0.192, There are slow and repetitive parts, BUT it has just enough spice to keep it interesting. compound: 0.4678, neg: 0.079, neu: 0.735, pos: 0.186, The script is not fantastic, but the acting is decent and the cinematography is EXCELLENT! compound: 0.7565, neg: 0.092, neu: 0.607, pos: 0.301, Roger Dodger is one of the most compelling variations on this theme. 
compound: 0.2944, neg: 0.0, neu: 0.834, pos: 0.166, Roger Dodger is one of the least compelling variations on this theme. compound: -0.1695, neg: 0.132, neu: 0.868, pos: 0.0, Roger Dodger is at least compelling as a variation on the theme. compound: 0.2263, neg: 0.0, neu: 0.84, pos: 0.16, they fall in love with the product compound: 0.6369, neg: 0.0, neu: 0.588, pos: 0.412, but then it breaks compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, usually around the time the 90 day warranty expires compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, the twin towers collapsed today compound: -0.2732, neg: 0.344, neu: 0.656, pos: 0.0, However, Mr. Carter solemnly argues, his client carried out the kidnapping under orders and in the ''least offensive way possible.'' compound: -0.5859, neg: 0.23, neu: 0.697, pos: 0.074,nltk-3.1/nltk/test/sentiwordnet.doctest0000644000076500000240000000167212607224144020127 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ====================== SentiWordNet Interface ====================== SentiWordNet can be imported like this: >>> from nltk.corpus import sentiwordnet as swn ------------ SentiSynsets ------------ >>> breakdown = swn.senti_synset('breakdown.n.03') >>> print(breakdown) >>> breakdown.pos_score() 0.0 >>> breakdown.neg_score() 0.25 >>> breakdown.obj_score() 0.75 ------ Lookup ------ >>> list(swn.senti_synsets('slow')) # doctest: +NORMALIZE_WHITESPACE [SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'), SentiSynset('slow.v.03'), SentiSynset('slow.a.01'), SentiSynset('slow.a.02'), SentiSynset('slow.a.04'), SentiSynset('slowly.r.01'), SentiSynset('behind.r.03')] >>> happy = swn.senti_synsets('happy', 'a') >>> all = swn.all_senti_synsets() nltk-3.1/nltk/test/simple.doctest0000644000076500000240000000454412607224144016674 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ================= EasyInstall Tests ================= This file contains some simple tests that will be run by EasyInstall in order to test the installation when NLTK-Data is absent. >>> from __future__ import print_function ------------ Tokenization ------------ >>> from nltk.tokenize import wordpunct_tokenize >>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n" ... "two of them.\n\nThanks.") >>> wordpunct_tokenize(s) # doctest: +NORMALIZE_WHITESPACE ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] ------- Metrics ------- >>> from nltk.metrics import precision, recall, f_measure >>> reference = 'DET NN VB DET JJ NN NN IN DET NN'.split() >>> test = 'DET VB VB DET NN NN NN IN DET NN'.split() >>> reference_set = set(reference) >>> test_set = set(test) >>> precision(reference_set, test_set) 1.0 >>> print(recall(reference_set, test_set)) 0.8 >>> print(f_measure(reference_set, test_set)) 0.88888888888... ------------------ Feature Structures ------------------ >>> from nltk import FeatStruct >>> fs1 = FeatStruct(PER=3, NUM='pl', GND='fem') >>> fs2 = FeatStruct(POS='N', AGR=fs1) >>> print(fs2) [ [ GND = 'fem' ] ] [ AGR = [ NUM = 'pl' ] ] [ [ PER = 3 ] ] [ ] [ POS = 'N' ] >>> print(fs2['AGR']) [ GND = 'fem' ] [ NUM = 'pl' ] [ PER = 3 ] >>> print(fs2['AGR']['PER']) 3 ------- Parsing ------- >>> from nltk.parse.recursivedescent import RecursiveDescentParser >>> from nltk.grammar import CFG >>> grammar = CFG.fromstring(""" ... S -> NP VP ... PP -> P NP ... 
NP -> 'the' N | N PP | 'the' N PP ... VP -> V NP | V PP | V NP PP ... N -> 'cat' | 'dog' | 'rug' ... V -> 'chased' ... P -> 'on' ... """) >>> rd = RecursiveDescentParser(grammar) >>> sent = 'the cat chased the dog on the rug'.split() >>> for t in rd.parse(sent): ... print(t) (S (NP the (N cat)) (VP (V chased) (NP the (N dog) (PP (P on) (NP the (N rug)))))) (S (NP the (N cat)) (VP (V chased) (NP the (N dog)) (PP (P on) (NP the (N rug))))) nltk-3.1/nltk/test/stem.doctest0000644000076500000240000000401212607224144016341 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ========== Stemmers ========== Overview ~~~~~~~~ Stemmers remove morphological affixes from words, leaving only the word stem. >>> from __future__ import print_function >>> from nltk.stem import * Unit tests for the Porter stemmer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ >>> from nltk.stem.porter import * Create a new Porter stemmer. >>> stemmer = PorterStemmer() Test the stemmer on various pluralised words. >>> plurals = ['caresses', 'flies', 'dies', 'mules', 'denied', ... 'died', 'agreed', 'owned', 'humbled', 'sized', ... 'meeting', 'stating', 'siezing', 'itemization', ... 'sensational', 'traditional', 'reference', 'colonizer', ... 'plotted'] >>> singles = [stemmer.stem(plural) for plural in plurals] >>> print(' '.join(singles)) # doctest: +NORMALIZE_WHITESPACE caress fli die mule deni die agre own humbl size meet state siez item sensat tradit refer colon plot Unit tests for Snowball stemmer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ >>> from nltk.stem.snowball import SnowballStemmer See which languages are supported. >>> print(" ".join(SnowballStemmer.languages)) danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish Create a new instance of a language specific subclass. >>> stemmer = SnowballStemmer("english") Stem a word. >>> print(stemmer.stem("running")) run Decide not to stem stopwords. >>> stemmer2 = SnowballStemmer("english", ignore_stopwords=True) >>> print(stemmer.stem("having")) have >>> print(stemmer2.stem("having")) having The 'english' stemmer is better than the original 'porter' stemmer. >>> print(SnowballStemmer("english").stem("generously")) generous >>> print(SnowballStemmer("porter").stem("generously")) gener .. note:: Extra stemmer tests can be found in `nltk.test.unit.test_stem`. nltk-3.1/nltk/test/tag.doctest0000644000076500000240000000133512607224144016151 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT Regression Tests ~~~~~~~~~~~~~~~~ Sequential Taggers ------------------ Add tests for: - make sure backoff is being done correctly. - make sure ngram taggers don't use previous sentences for context. - make sure ngram taggers see 'beginning of the sentence' as a unique context - make sure regexp tagger's regexps are tried in order - train on some simple examples, & make sure that the size & the generated models are correct. - make sure cutoff works as intended - make sure that ngram models only exclude contexts covered by the backoff tagger if the backoff tagger gets that context correct at *all* locations. nltk-3.1/nltk/test/tokenize.doctest0000644000076500000240000001167212607517365017245 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. 
For license information, see LICENSE.TXT >>> from __future__ import print_function >>> from nltk.tokenize import * Regression Tests: Treebank Tokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Some test strings. >>> s1 = "On a $50,000 mortgage of 30 years at 8 percent, the monthly payment would be $366.88." >>> word_tokenize(s1) ['On', 'a', '$', '50,000', 'mortgage', 'of', '30', 'years', 'at', '8', 'percent', ',', 'the', 'monthly', 'payment', 'would', 'be', '$', '366.88', '.'] >>> s2 = "\"We beat some pretty good teams to get here,\" Slocum said." >>> word_tokenize(s2) ['``', 'We', 'beat', 'some', 'pretty', 'good', 'teams', 'to', 'get', 'here', ',', "''", 'Slocum', 'said', '.'] >>> s3 = "Well, we couldn't have this predictable, cliche-ridden, \"Touched by an Angel\" (a show creator John Masius worked on) wanna-be if she didn't." >>> word_tokenize(s3) ['Well', ',', 'we', 'could', "n't", 'have', 'this', 'predictable', ',', 'cliche-ridden', ',', '``', 'Touched', 'by', 'an', 'Angel', "''", '(', 'a', 'show', 'creator', 'John', 'Masius', 'worked', 'on', ')', 'wanna-be', 'if', 'she', 'did', "n't", '.'] >>> s4 = "I cannot cannot work under these conditions!" >>> word_tokenize(s4) ['I', 'can', 'not', 'can', 'not', 'work', 'under', 'these', 'conditions', '!'] >>> s5 = "The company spent $30,000,000 last year." >>> word_tokenize(s5) ['The', 'company', 'spent', '$', '30,000,000', 'last', 'year', '.'] >>> s6 = "The company spent 40.75% of its income last year." >>> word_tokenize(s6) ['The', 'company', 'spent', '40.75', '%', 'of', 'its', 'income', 'last', 'year', '.'] >>> s7 = "He arrived at 3:00 pm." >>> word_tokenize(s7) ['He', 'arrived', 'at', '3:00', 'pm', '.'] >>> s8 = "I bought these items: books, pencils, and pens." >>> word_tokenize(s8) ['I', 'bought', 'these', 'items', ':', 'books', ',', 'pencils', ',', 'and', 'pens', '.'] >>> s9 = "Though there were 150, 100 of them were old." >>> word_tokenize(s9) ['Though', 'there', 'were', '150', ',', '100', 'of', 'them', 'were', 'old', '.'] >>> s10 = "There were 300,000, but that wasn't enough." >>> word_tokenize(s10) ['There', 'were', '300,000', ',', 'but', 'that', 'was', "n't", 'enough', '.'] Sentence tokenization in word_tokenize: >>> s11 = "I called Dr. Jones. I called Dr. Jones." >>> word_tokenize(s11) ['I', 'called', 'Dr.', 'Jones', '.', 'I', 'called', 'Dr.', 'Jones', '.'] >>> s12 = ("Ich muss unbedingt daran denken, Mehl, usw. fur einen " ... "Kuchen einzukaufen. Ich muss.") >>> word_tokenize(s12) ['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw', '.', 'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.'] >>> word_tokenize(s12, 'german') ['Ich', 'muss', 'unbedingt', 'daran', 'denken', ',', 'Mehl', ',', 'usw.', 'fur', 'einen', 'Kuchen', 'einzukaufen', '.', 'Ich', 'muss', '.'] Regression Tests: Regexp Tokenizer ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Some additional test strings. >>> s = ("Good muffins cost $3.88\nin New York. Please buy me\n" ... "two of them.\n\nThanks.") >>> s2 = ("Alas, it has not rained today. When, do you think, " ... "will it rain again?") >>> s3 = ("
<p>Although this is <b>not</b> the case here, we must "
    ...       "not relax our vigilance!</p>")

    >>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=False)
    [', ', '. ', ', ', ', ', '?']
    >>> regexp_tokenize(s2, r'[,\.\?!"]\s*', gaps=True)
    ['Alas', 'it has not rained today', 'When', 'do you think', 'will it rain again']

Take care to avoid using capturing groups:

    >>> regexp_tokenize(s3, r'</?[bp]>', gaps=False)
    ['<p>', '<b>', '</b>', '</p>']
    >>> regexp_tokenize(s3, r'</?(?:b|p)>', gaps=False)
    ['<p>', '<b>', '</b>', '</p>
    '] >>> regexp_tokenize(s3, r'', gaps=True) ['Although this is ', 'not', ' the case here, we must not relax our vigilance!'] Named groups are capturing groups, and confuse the tokenizer: >>> regexp_tokenize(s3, r'b|p)>', gaps=False) ['p', 'b', 'b', 'p'] >>> regexp_tokenize(s3, r'b|p)>', gaps=True) ['p', 'Although this is ', 'b', 'not', 'b', ' the case here, we must not relax our vigilance!', 'p'] Make sure that nested groups don't confuse the tokenizer: >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=False) ['las', 'has', 'rai', 'rai'] >>> regexp_tokenize(s2, r'(?:h|r|l)a(?:s|(?:i|n0))', gaps=True) ['A', ', it ', ' not ', 'ned today. When, do you think, will it ', 'n again?'] Back-references require capturing groups, and these are not supported: >>> regexp_tokenize("aabbbcccc", r'(.)\1') ['a', 'b', 'c', 'c'] A simple sentence tokenizer '\.(\s+|$)' >>> regexp_tokenize(s, pattern=r'\.(?:\s+|$)', gaps=True) ['Good muffins cost $3.88\nin New York', 'Please buy me\ntwo of them', 'Thanks'] nltk-3.1/nltk/test/toolbox.doctest0000644000076500000240000002344212607224144017067 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT =============================== Unit test cases for ``toolbox`` =============================== >>> from nltk import toolbox -------------------------- ``toolbox.StandardFormat`` -------------------------- >>> f = toolbox.StandardFormat() ``toolbox.StandardFormat.open()`` --------------------------------- >>> import os, tempfile >>> (fd, fname) = tempfile.mkstemp() >>> tf = os.fdopen(fd, "w") >>> _ = tf.write('\\lx a value\n\\lx another value\n') >>> tf.close() >>> f = toolbox.StandardFormat() >>> f.open(fname) >>> list(f.fields()) [('lx', 'a value'), ('lx', 'another value')] >>> f.close() >>> os.unlink(fname) ``toolbox.StandardFormat.open_string()`` ---------------------------------------- >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\n\\lx another value\n') >>> list(f.fields()) [('lx', 'a value'), ('lx', 'another value')] >>> f.close() ``toolbox.StandardFormat.close()`` ---------------------------------- >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\n\\lx another value\n') >>> list(f.fields()) [('lx', 'a value'), ('lx', 'another value')] >>> f.close() ``toolbox.StandardFormat.line_num`` --------------------------------------- ``StandardFormat.line_num`` contains the line number of the last line returned: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\n\\lx another value\n\\lx a third value\n') >>> line_nums = [] >>> for l in f.raw_fields(): ... line_nums.append(f.line_num) >>> line_nums [1, 2, 3] ``StandardFormat.line_num`` contains the line number of the last line returned: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n') >>> line_nums = [] >>> for l in f.raw_fields(): ... line_nums.append(f.line_num) >>> line_nums [2, 5, 7] ``StandardFormat.line_num`` doesn't exist before openning or after closing a file or string: >>> f = toolbox.StandardFormat() >>> f.line_num Traceback (most recent call last): ... AttributeError: 'StandardFormat' object has no attribute 'line_num' >>> f.open_string('\\lx two\nlines\n\\lx three\nlines\n\n\\lx two\nlines\n') >>> line_nums = [] >>> for l in f.raw_fields(): ... line_nums.append(f.line_num) >>> line_nums [2, 5, 7] >>> f.close() >>> f.line_num Traceback (most recent call last): ... 
AttributeError: 'StandardFormat' object has no attribute 'line_num' ``toolbox.StandardFormat.raw_fields()`` --------------------------------------- ``raw_fields()`` returns an iterator over tuples of two strings representing the marker and its value. The marker is given without the backslash and the value without its trailing newline: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\n\\lx another value\n') >>> list(f.raw_fields()) [('lx', 'a value'), ('lx', 'another value')] an empty file returns nothing: >>> f = toolbox.StandardFormat() >>> f.open_string('') >>> list(f.raw_fields()) [] file with only a newline returns WHAT SHOULD IT RETURN???: >>> f = toolbox.StandardFormat() >>> f.open_string('\n') >>> list(f.raw_fields()) [(None, '')] file with only one field should be parsed ok: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx one value\n') >>> list(f.raw_fields()) [('lx', 'one value')] file without a trailing newline should be parsed ok: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\n\\lx another value') >>> list(f.raw_fields()) [('lx', 'a value'), ('lx', 'another value')] trailing white space is preserved except for the final newline: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n') >>> list(f.raw_fields()) [('lx', 'trailing space '), ('lx', 'trailing tab\t'), ('lx', 'extra newline\n')] line wrapping is preserved: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n') >>> list(f.raw_fields()) [('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')] file beginning with a multiline record should be parsed ok: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n') >>> list(f.raw_fields()) [('lx', 'a value\nmore of the value\nand still more'), ('lc', 'another val')] file ending with a multiline record should be parsed ok: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lc a value\n\\lx another value\nmore of the value\nand still more\n') >>> list(f.raw_fields()) [('lc', 'a value'), ('lx', 'another value\nmore of the value\nand still more')] file beginning with a BOM should be parsed ok: >>> f = toolbox.StandardFormat() >>> f.open_string('\xef\xbb\xbf\\lx a value\n\\lx another value\n') >>> list(f.raw_fields()) [('lx', 'a value'), ('lx', 'another value')] file beginning with two BOMs should ignore only the first one: >>> f = toolbox.StandardFormat() >>> f.open_string('\xef\xbb\xbf\xef\xbb\xbf\\lx a value\n\\lx another value\n') >>> list(f.raw_fields()) [(None, '\xef\xbb\xbf\\lx a value'), ('lx', 'another value')] should not ignore a BOM not at the beginning of the file: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\n\xef\xbb\xbf\\lx another value\n') >>> list(f.raw_fields()) [('lx', 'a value\n\xef\xbb\xbf\\lx another value')] ``toolbox.StandardFormat.fields()`` ----------------------------------- trailing white space is not preserved: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx trailing space \n\\lx trailing tab\t\n\\lx extra newline\n\n') >>> list(f.fields()) [('lx', 'trailing space'), ('lx', 'trailing tab'), ('lx', 'extra newline')] multiline fields are unwrapped: >>> f = toolbox.StandardFormat() >>> f.open_string('\\lx a value\nmore of the value\nand still more\n\\lc another val\n') >>> list(f.fields()) [('lx', 'a value more of the value and still more'), ('lc', 'another val')] 
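The ``(marker, value)`` pairs returned by ``fields()`` are flat, so callers often regroup them into one mapping per record by splitting on a record marker such as ``lx``. The sketch below illustrates one way to do that; it is not part of this test suite, and the ``group_records`` helper is a hypothetical example rather than an NLTK API::

    from collections import defaultdict

    from nltk import toolbox

    def group_records(sf, record_marker='lx'):
        # Start a new record whenever the record marker is seen, and
        # collect every (marker, value) pair into the current record.
        records, current = [], None
        for marker, value in sf.fields():
            if marker == record_marker:
                current = defaultdict(list)
                records.append(current)
            if current is not None:
                current[marker].append(value)
        return [dict(r) for r in records]

    sf = toolbox.StandardFormat()
    sf.open_string('\\lx kaa\n\\ps V.A\n\\ge gag\n\\lx kaa\n\\ps V.B\n\\ge strangle\n')
    print(group_records(sf))
    sf.close()

For structured output, NLTK itself provides ``toolbox.ToolboxData.parse()``, which is exercised later in this file.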
markers ------- A backslash in the first position on a new line indicates the start of a marker. The backslash is not part of the marker: >>> f = toolbox.StandardFormat() >>> f.open_string('\\mk a value\n') >>> list(f.fields()) [('mk', 'a value')] If the backslash occurs later in the line it does not indicate the start of a marker: >>> f = toolbox.StandardFormat() >>> f.open_string('\\mk a value\n \\mk another one\n') >>> list(f.raw_fields()) [('mk', 'a value\n \\mk another one')] There is no specific limit to the length of a marker: >>> f = toolbox.StandardFormat() >>> f.open_string('\\this_is_an_extremely_long_marker value\n') >>> list(f.fields()) [('this_is_an_extremely_long_marker', 'value')] A marker can contain any non white space character: >>> f = toolbox.StandardFormat() >>> f.open_string('\\`~!@#$%^&*()_-=+[{]}\|,<.>/?;:"0123456789 value\n') >>> list(f.fields()) [('`~!@#$%^&*()_-=+[{]}\\|,<.>/?;:"0123456789', 'value')] A marker is terminated by any white space character: >>> f = toolbox.StandardFormat() >>> f.open_string('\\mk a value\n\\mk\tanother one\n\\mk\rthird one\n\\mk\ffourth one') >>> list(f.fields()) [('mk', 'a value'), ('mk', 'another one'), ('mk', 'third one'), ('mk', 'fourth one')] Consecutive whitespace characters (except newline) are treated the same as one: >>> f = toolbox.StandardFormat() >>> f.open_string('\\mk \t\r\fa value\n') >>> list(f.fields()) [('mk', 'a value')] ----------------------- ``toolbox.ToolboxData`` ----------------------- >>> db = toolbox.ToolboxData() ``toolbox.ToolboxData.parse()`` ------------------------------- check that normal parsing works: >>> from xml.etree import ElementTree >>> td = toolbox.ToolboxData() >>> s = """\\_sh v3.0 400 Rotokas Dictionary ... \\_DateStampHasFourDigitYear ... ... \\lx kaa ... \\ps V.A ... \\ge gag ... \\gp nek i pas ... ... \\lx kaa ... \\ps V.B ... \\ge strangle ... \\gp pasim nek ... """ >>> td.open_string(s) >>> tree = td.parse(key='lx') >>> tree.tag 'toolbox_data' >>> ElementTree.tostring(list(tree)[0]).decode('utf8') '
    <header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>
    ' >>> ElementTree.tostring(list(tree)[1]).decode('utf8') 'kaaV.Agagnek i pas' >>> ElementTree.tostring(list(tree)[2]).decode('utf8') 'kaaV.Bstranglepasim nek' check that guessing the key marker works: >>> from xml.etree import ElementTree >>> td = toolbox.ToolboxData() >>> s = """\\_sh v3.0 400 Rotokas Dictionary ... \\_DateStampHasFourDigitYear ... ... \\lx kaa ... \\ps V.A ... \\ge gag ... \\gp nek i pas ... ... \\lx kaa ... \\ps V.B ... \\ge strangle ... \\gp pasim nek ... """ >>> td.open_string(s) >>> tree = td.parse() >>> ElementTree.tostring(list(tree)[0]).decode('utf8') '
    <header><_sh>v3.0 400 Rotokas Dictionary</_sh><_DateStampHasFourDigitYear /></header>
    ' >>> ElementTree.tostring(list(tree)[1]).decode('utf8') 'kaaV.Agagnek i pas' >>> ElementTree.tostring(list(tree)[2]).decode('utf8') 'kaaV.Bstranglepasim nek' ----------------------- ``toolbox`` functions ----------------------- ``toolbox.to_sfm_string()`` ------------------------------- nltk-3.1/nltk/test/translate.doctest0000644000076500000240000002003712607224144017373 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT .. -*- coding: utf-8 -*- ========= Alignment ========= Corpus Reader ------------- >>> from nltk.corpus import comtrans >>> words = comtrans.words('alignment-en-fr.txt') >>> for word in words[:6]: ... print(word) Resumption of the session I declare >>> als = comtrans.aligned_sents('alignment-en-fr.txt')[0] >>> als # doctest: +NORMALIZE_WHITESPACE AlignedSent(['Resumption', 'of', 'the', 'session'], ['Reprise', 'de', 'la', 'session'], Alignment([(0, 0), (1, 1), (2, 2), (3, 3)])) Alignment Objects ----------------- Aligned sentences are simply a mapping between words in a sentence: >>> print(" ".join(als.words)) Resumption of the session >>> print(" ".join(als.mots)) Reprise de la session >>> als.alignment Alignment([(0, 0), (1, 1), (2, 2), (3, 3)]) Usually we look at them from the perspective of a source to a target language, but they are easily inverted: >>> als.invert() # doctest: +NORMALIZE_WHITESPACE AlignedSent(['Reprise', 'de', 'la', 'session'], ['Resumption', 'of', 'the', 'session'], Alignment([(0, 0), (1, 1), (2, 2), (3, 3)])) We can create new alignments, but these need to be in the correct range of the corresponding sentences: >>> from nltk.translate import Alignment, AlignedSent >>> als = AlignedSent(['Reprise', 'de', 'la', 'session'], ... ['Resumption', 'of', 'the', 'session'], ... Alignment([(0, 0), (1, 4), (2, 1), (3, 3)])) Traceback (most recent call last): ... IndexError: Alignment is outside boundary of mots You can set alignments with any sequence of tuples, so long as the first two indexes of the tuple are the alignment indices: >>> als.alignment = Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))]) >>> Alignment([(0, 0), (1, 1), (2, 2, "boat"), (3, 3, False, (1,2))]) Alignment([(0, 0), (1, 1), (2, 2, 'boat'), (3, 3, False, (1, 2))]) Alignment Algorithms -------------------- EM for IBM Model 1 ~~~~~~~~~~~~~~~~~~ Here is an example from Koehn, 2010: >>> from nltk.translate import IBMModel1 >>> corpus = [AlignedSent(['the', 'house'], ['das', 'Haus']), ... AlignedSent(['the', 'book'], ['das', 'Buch']), ... AlignedSent(['a', 'book'], ['ein', 'Buch'])] >>> em_ibm1 = IBMModel1(corpus, 20) >>> print(round(em_ibm1.translation_table['the']['das'], 1)) 1.0 >>> print(round(em_ibm1.translation_table['book']['das'], 1)) 0.0 >>> print(round(em_ibm1.translation_table['house']['das'], 1)) 0.0 >>> print(round(em_ibm1.translation_table['the']['Buch'], 1)) 0.0 >>> print(round(em_ibm1.translation_table['book']['Buch'], 1)) 1.0 >>> print(round(em_ibm1.translation_table['a']['Buch'], 1)) 0.0 >>> print(round(em_ibm1.translation_table['book']['ein'], 1)) 0.0 >>> print(round(em_ibm1.translation_table['a']['ein'], 1)) 1.0 >>> print(round(em_ibm1.translation_table['the']['Haus'], 1)) 0.0 >>> print(round(em_ibm1.translation_table['house']['Haus'], 1)) 1.0 >>> print(round(em_ibm1.translation_table['book'][None], 1)) 0.5 And using an NLTK corpus. 
We train on only 10 sentences, since it is so slow: >>> from nltk.corpus import comtrans >>> com_ibm1 = IBMModel1(comtrans.aligned_sents()[:10], 20) >>> print(round(com_ibm1.translation_table['bitte']['Please'], 1)) 0.2 >>> print(round(com_ibm1.translation_table['Sitzungsperiode']['session'], 1)) 1.0 Evaluation ---------- The evaluation metrics for alignments are usually not interested in the contents of alignments but more often the comparison to a "gold standard" alignment that has been been constructed by human experts. For this reason we often want to work just with raw set operations against the alignment points. This then gives us a very clean form for defining our evaluation metrics. .. Note:: The AlignedSent class has no distinction of "possible" or "sure" alignments. Thus all alignments are treated as "sure". Consider the following aligned sentence for evaluation: >>> my_als = AlignedSent(['Resumption', 'of', 'the', 'session'], ... ['Reprise', 'de', 'la', 'session'], ... Alignment([(0, 0), (3, 3), (1, 2), (1, 1), (1, 3)])) Precision ~~~~~~~~~ ``precision = |A∩P| / |A|`` **Precision** is probably the most well known evaluation metric and it is implemented in `nltk.metrics.scores.precision`_. Since precision is simply interested in the proportion of correct alignments, we calculate the ratio of the number of our test alignments (*A*) that match a possible alignment (*P*), over the number of test alignments provided. There is no penalty for missing a possible alignment in our test alignments. An easy way to game this metric is to provide just one test alignment that is in *P* [OCH2000]_. Here are some examples: >>> from nltk.metrics import precision >>> als.alignment = Alignment([(0,0), (1,1), (2,2), (3,3)]) >>> precision(Alignment([]), als.alignment) 0.0 >>> precision(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment) 1.0 >>> precision(Alignment([(0,0), (3,3)]), als.alignment) 0.5 >>> precision(Alignment.fromstring('0-0 3-3'), als.alignment) 0.5 >>> precision(Alignment([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]), als.alignment) 1.0 >>> precision(als.alignment, my_als.alignment) 0.6 .. _nltk.metrics.scores.precision: http://www.nltk.org/api/nltk.metrics.html#nltk.metrics.scores.precision Recall ~~~~~~ ``recall = |A∩S| / |S|`` **Recall** is another well known evaluation metric that has a set based implementation in NLTK as `nltk.metrics.scores.recall`_. Since recall is simply interested in the proportion of found alignments, we calculate the ratio of the number of our test alignments (*A*) that match a sure alignment (*S*) over the number of sure alignments. There is no penalty for producing a lot of test alignments. An easy way to game this metric is to include every possible alignment in our test alignments, regardless if they are correct or not [OCH2000]_. Here are some examples: >>> from nltk.metrics import recall >>> print(recall(Alignment([]), als.alignment)) None >>> recall(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment) 1.0 >>> recall(Alignment.fromstring('0-0 3-3'), als.alignment) 1.0 >>> recall(Alignment([(0,0), (3,3)]), als.alignment) 1.0 >>> recall(Alignment([(0,0), (1,1), (2,2), (3,3), (1,2), (2,1)]), als.alignment) 0.66666... >>> recall(als.alignment, my_als.alignment) 0.75 .. _nltk.metrics.scores.recall: http://www.nltk.org/api/nltk.metrics.html#nltk.metrics.scores.recall Alignment Error Rate (AER) ~~~~~~~~~~~~~~~~~~~~~~~~~~ ``AER = 1 - (|A∩S| + |A∩P|) / (|A| + |S|)`` **Alignment Error Rate** is commonly used metric for assessing sentence alignments. 
It combines precision and recall metrics together such that a perfect alignment must have all of the sure alignments and may have some possible alignments [MIHALCEA2003]_ [KOEHN2010]_. .. Note:: [KOEHN2010]_ defines the AER as ``AER = (|A∩S| + |A∩P|) / (|A| + |S|)`` in his book, but corrects it to the above in his online errata. This is in line with [MIHALCEA2003]_. Here are some examples: >>> from nltk.translate import alignment_error_rate >>> alignment_error_rate(Alignment([]), als.alignment) 1.0 >>> alignment_error_rate(Alignment([(0,0), (1,1), (2,2), (3,3)]), als.alignment) 0.0 >>> alignment_error_rate(als.alignment, my_als.alignment) 0.333333... >>> alignment_error_rate(als.alignment, my_als.alignment, ... als.alignment | Alignment([(1,2), (2,1)])) 0.222222... .. [OCH2000] Och, F. and Ney, H. (2000) *Statistical Machine Translation*, EAMT Workshop .. [MIHALCEA2003] Mihalcea, R. and Pedersen, T. (2003) *An evaluation exercise for word alignment*, HLT-NAACL 2003 .. [KOEHN2010] Koehn, P. (2010) *Statistical Machine Translation*, Cambridge University Press nltk-3.1/nltk/test/translate_fixt.py0000644000076500000240000000014712607224144017410 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import from nltk.corpus import teardown_modulenltk-3.1/nltk/test/tree.doctest0000644000076500000240000011617312607224144016344 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT =============================== Unit tests for nltk.tree.Tree =============================== >>> from nltk.tree import * Some trees to run tests on: >>> dp1 = Tree('dp', [Tree('d', ['the']), Tree('np', ['dog'])]) >>> dp2 = Tree('dp', [Tree('d', ['the']), Tree('np', ['cat'])]) >>> vp = Tree('vp', [Tree('v', ['chased']), dp2]) >>> tree = Tree('s', [dp1, vp]) >>> print(tree) (s (dp (d the) (np dog)) (vp (v chased) (dp (d the) (np cat)))) The node label is accessed using the `label()` method: >>> dp1.label(), dp2.label(), vp.label(), tree.label() ('dp', 'dp', 'vp', 's') >>> print(tree[1,1,1,0]) cat The `treepositions` method returns a list of the tree positions of subtrees and leaves in a tree. 
By default, it gives the position of every tree, subtree, and leaf, in prefix order: >>> print(tree.treepositions()) [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0), (1, 1, 0, 0), (1, 1, 1), (1, 1, 1, 0)] In addition to `str` and `repr`, several methods exist to convert a tree object to one of several standard tree encodings: >>> print(tree.pformat_latex_qtree()) \Tree [.s [.dp [.d the ] [.np dog ] ] [.vp [.v chased ] [.dp [.d the ] [.np cat ] ] ] ] There is also a fancy ASCII art representation: >>> tree.pretty_print() s ________|_____ | vp | _____|___ dp | dp ___|___ | ___|___ d np v d np | | | | | the dog chased the cat >>> tree.pretty_print(unicodelines=True, nodedist=4) s ┌──────────────┴────────┠│ vp │ ┌────────┴──────┠dp │ dp ┌──────┴──────┠│ ┌──────┴──────┠d np v d np │ │ │ │ │ the dog chased the cat Trees can be initialized from treebank strings: >>> tree2 = Tree.fromstring('(S (NP I) (VP (V enjoyed) (NP my cookie)))') >>> print(tree2) (S (NP I) (VP (V enjoyed) (NP my cookie))) Trees can be compared for equality: >>> tree == Tree.fromstring(str(tree)) True >>> tree2 == Tree.fromstring(str(tree2)) True >>> tree == tree2 False >>> tree == Tree.fromstring(str(tree2)) False >>> tree2 == Tree.fromstring(str(tree)) False >>> tree != Tree.fromstring(str(tree)) False >>> tree2 != Tree.fromstring(str(tree2)) False >>> tree != tree2 True >>> tree != Tree.fromstring(str(tree2)) True >>> tree2 != Tree.fromstring(str(tree)) True >>> tree < tree2 or tree > tree2 True Tree Parsing ============ The class method `Tree.fromstring()` can be used to parse trees, and it provides some additional options. >>> tree = Tree.fromstring('(S (NP I) (VP (V enjoyed) (NP my cookie)))') >>> print(tree) (S (NP I) (VP (V enjoyed) (NP my cookie))) When called on a subclass of `Tree`, it will create trees of that type: >>> tree = ImmutableTree.fromstring('(VP (V enjoyed) (NP my cookie))') >>> print(tree) (VP (V enjoyed) (NP my cookie)) >>> print(type(tree)) >>> tree[1] = 'x' Traceback (most recent call last): . . . ValueError: ImmutableTree may not be modified >>> del tree[0] Traceback (most recent call last): . . . ValueError: ImmutableTree may not be modified The ``brackets`` parameter can be used to specify two characters that should be used as brackets: >>> print(Tree.fromstring('[S [NP I] [VP [V enjoyed] [NP my cookie]]]', ... brackets='[]')) (S (NP I) (VP (V enjoyed) (NP my cookie))) >>> print(Tree.fromstring(' >>', ... brackets='<>')) (S (NP I) (VP (V enjoyed) (NP my cookie))) If ``brackets`` is not a string, or is not exactly two characters, then `Tree.fromstring` raises an exception: >>> Tree.fromstring(' >', brackets='') Traceback (most recent call last): . . . TypeError: brackets must be a length-2 string >>> Tree.fromstring(' >', brackets='<<>>') Traceback (most recent call last): . . . TypeError: brackets must be a length-2 string >>> Tree.fromstring(' >', brackets=12) Traceback (most recent call last): . . . TypeError: brackets must be a length-2 string >>> Tree.fromstring('<>', brackets=('<<','>>')) Traceback (most recent call last): . . . TypeError: brackets must be a length-2 string (We may add support for multi-character brackets in the future, in which case the ``brackets=('<<','>>')`` example would start working.) Whitespace brackets are not permitted: >>> Tree.fromstring('(NP my cookie\n', brackets='(\n') Traceback (most recent call last): . . . 
TypeError: whitespace brackets not allowed If an invalid tree is given to Tree.fromstring, then it raises a ValueError, with a description of the problem: >>> Tree.fromstring('(NP my cookie) (NP my milk)') Traceback (most recent call last): . . . ValueError: Tree.fromstring(): expected 'end-of-string' but got '(NP' at index 15. "...y cookie) (NP my mil..." ^ >>> Tree.fromstring(')NP my cookie(') Traceback (most recent call last): . . . ValueError: Tree.fromstring(): expected '(' but got ')' at index 0. ")NP my coo..." ^ >>> Tree.fromstring('(NP my cookie))') Traceback (most recent call last): . . . ValueError: Tree.fromstring(): expected 'end-of-string' but got ')' at index 14. "...my cookie))" ^ >>> Tree.fromstring('my cookie)') Traceback (most recent call last): . . . ValueError: Tree.fromstring(): expected '(' but got 'my' at index 0. "my cookie)" ^ >>> Tree.fromstring('(NP my cookie') Traceback (most recent call last): . . . ValueError: Tree.fromstring(): expected ')' but got 'end-of-string' at index 13. "... my cookie" ^ >>> Tree.fromstring('') Traceback (most recent call last): . . . ValueError: Tree.fromstring(): expected '(' but got 'end-of-string' at index 0. "" ^ Trees with no children are supported: >>> print(Tree.fromstring('(S)')) (S ) >>> print(Tree.fromstring('(X (Y) (Z))')) (X (Y ) (Z )) Trees with an empty node label and no children are supported: >>> print(Tree.fromstring('()')) ( ) >>> print(Tree.fromstring('(X () ())')) (X ( ) ( )) Trees with an empty node label and children are supported, but only if the first child is not a leaf (otherwise, it will be treated as the node label). >>> print(Tree.fromstring('((A) (B) (C))')) ( (A ) (B ) (C )) >>> print(Tree.fromstring('((A) leaf)')) ( (A ) leaf) >>> print(Tree.fromstring('(((())))')) ( ( ( ( )))) The optional arguments `read_node` and `read_leaf` may be used to transform the string values of nodes or leaves. >>> print(Tree.fromstring('(A b (C d e) (F (G h i)))', ... read_node=lambda s: '<%s>' % s, ... read_leaf=lambda s: '"%s"' % s)) (
    "b" ( "d" "e") ( ( "h" "i"))) These transformation functions are typically used when the node or leaf labels should be parsed to a non-string value (such as a feature structure). If node and leaf labels need to be able to include whitespace, then you must also use the optional `node_pattern` and `leaf_pattern` arguments. >>> from nltk.featstruct import FeatStruct >>> tree = Tree.fromstring('([cat=NP] [lex=the] [lex=dog])', ... read_node=FeatStruct, read_leaf=FeatStruct) >>> tree.set_label(tree.label().unify(FeatStruct('[num=singular]'))) >>> print(tree) ([cat='NP', num='singular'] [lex='the'] [lex='dog']) The optional argument ``remove_empty_top_bracketing`` can be used to remove any top-level empty bracketing that occurs. >>> print(Tree.fromstring('((S (NP I) (VP (V enjoyed) (NP my cookie))))', ... remove_empty_top_bracketing=True)) (S (NP I) (VP (V enjoyed) (NP my cookie))) It will not remove a top-level empty bracketing with multiple children: >>> print(Tree.fromstring('((A a) (B b))')) ( (A a) (B b)) Parented Trees ============== `ParentedTree` is a subclass of `Tree` that automatically maintains parent pointers for single-parented trees. Parented trees can be created directly from a node label and a list of children: >>> ptree = ( ... ParentedTree('VP', [ ... ParentedTree('VERB', ['saw']), ... ParentedTree('NP', [ ... ParentedTree('DET', ['the']), ... ParentedTree('NOUN', ['dog'])])])) >>> print(ptree) (VP (VERB saw) (NP (DET the) (NOUN dog))) Parented trees can be created from strings using the classmethod `ParentedTree.fromstring`: >>> ptree = ParentedTree.fromstring('(VP (VERB saw) (NP (DET the) (NOUN dog)))') >>> print(ptree) (VP (VERB saw) (NP (DET the) (NOUN dog))) >>> print(type(ptree)) Parented trees can also be created by using the classmethod `ParentedTree.convert` to convert another type of tree to a parented tree: >>> tree = Tree.fromstring('(VP (VERB saw) (NP (DET the) (NOUN dog)))') >>> ptree = ParentedTree.convert(tree) >>> print(ptree) (VP (VERB saw) (NP (DET the) (NOUN dog))) >>> print(type(ptree)) .. clean-up: >>> del tree `ParentedTree`\ s should never be used in the same tree as `Tree`\ s or `MultiParentedTree`\ s. Mixing tree implementations may result in incorrect parent pointers and in `TypeError` exceptions: >>> # Inserting a Tree in a ParentedTree gives an exception: >>> ParentedTree('NP', [ ... Tree('DET', ['the']), Tree('NOUN', ['dog'])]) Traceback (most recent call last): . . . TypeError: Can not insert a non-ParentedTree into a ParentedTree >>> # inserting a ParentedTree in a Tree gives incorrect parent pointers: >>> broken_tree = Tree('NP', [ ... ParentedTree('DET', ['the']), ParentedTree('NOUN', ['dog'])]) >>> print(broken_tree[0].parent()) None Parented Tree Methods ------------------------ In addition to all the methods defined by the `Tree` class, the `ParentedTree` class adds six new methods whose values are automatically updated whenver a parented tree is modified: `parent()`, `parent_index()`, `left_sibling()`, `right_sibling()`, `root()`, and `treeposition()`. The `parent()` method contains a `ParentedTree`\ 's parent, if it has one; and ``None`` otherwise. `ParentedTree`\ s that do not have parents are known as "root trees." >>> for subtree in ptree.subtrees(): ... print(subtree) ... 
print(' Parent = %s' % subtree.parent()) (VP (VERB saw) (NP (DET the) (NOUN dog))) Parent = None (VERB saw) Parent = (VP (VERB saw) (NP (DET the) (NOUN dog))) (NP (DET the) (NOUN dog)) Parent = (VP (VERB saw) (NP (DET the) (NOUN dog))) (DET the) Parent = (NP (DET the) (NOUN dog)) (NOUN dog) Parent = (NP (DET the) (NOUN dog)) The `parent_index()` method stores the index of a tree in its parent's child list. If a tree does not have a parent, then its `parent_index` is ``None``. >>> for subtree in ptree.subtrees(): ... print(subtree) ... print(' Parent Index = %s' % subtree.parent_index()) ... assert (subtree.parent() is None or ... subtree.parent()[subtree.parent_index()] is subtree) (VP (VERB saw) (NP (DET the) (NOUN dog))) Parent Index = None (VERB saw) Parent Index = 0 (NP (DET the) (NOUN dog)) Parent Index = 1 (DET the) Parent Index = 0 (NOUN dog) Parent Index = 1 Note that ``ptree.parent().index(ptree)`` is *not* equivalent to ``ptree.parent_index()``. In particular, ``ptree.parent().index(ptree)`` will return the index of the first child of ``ptree.parent()`` that is equal to ``ptree`` (using ``==``); and that child may not be ``ptree``: >>> on_and_on = ParentedTree('CONJP', [ ... ParentedTree('PREP', ['on']), ... ParentedTree('COJN', ['and']), ... ParentedTree('PREP', ['on'])]) >>> second_on = on_and_on[2] >>> print(second_on.parent_index()) 2 >>> print(second_on.parent().index(second_on)) 0 The methods `left_sibling()` and `right_sibling()` can be used to get a parented tree's siblings. If a tree does not have a left or right sibling, then the corresponding method's value is ``None``: >>> for subtree in ptree.subtrees(): ... print(subtree) ... print(' Left Sibling = %s' % subtree.left_sibling()) ... print(' Right Sibling = %s' % subtree.right_sibling()) (VP (VERB saw) (NP (DET the) (NOUN dog))) Left Sibling = None Right Sibling = None (VERB saw) Left Sibling = None Right Sibling = (NP (DET the) (NOUN dog)) (NP (DET the) (NOUN dog)) Left Sibling = (VERB saw) Right Sibling = None (DET the) Left Sibling = None Right Sibling = (NOUN dog) (NOUN dog) Left Sibling = (DET the) Right Sibling = None A parented tree's root tree can be accessed using the `root()` method. This method follows the tree's parent pointers until it finds a tree without a parent. If a tree does not have a parent, then it is its own root: >>> for subtree in ptree.subtrees(): ... print(subtree) ... print(' Root = %s' % subtree.root()) (VP (VERB saw) (NP (DET the) (NOUN dog))) Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) (VERB saw) Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) (NP (DET the) (NOUN dog)) Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) (DET the) Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) (NOUN dog) Root = (VP (VERB saw) (NP (DET the) (NOUN dog))) The `treeposition()` method can be used to find a tree's treeposition relative to its root: >>> for subtree in ptree.subtrees(): ... print(subtree) ... print(' Tree Position = %s' % (subtree.treeposition(),)) ... assert subtree.root()[subtree.treeposition()] is subtree (VP (VERB saw) (NP (DET the) (NOUN dog))) Tree Position = () (VERB saw) Tree Position = (0,) (NP (DET the) (NOUN dog)) Tree Position = (1,) (DET the) Tree Position = (1, 0) (NOUN dog) Tree Position = (1, 1) Whenever a parented tree is modified, all of the methods described above (`parent()`, `parent_index()`, `left_sibling()`, `right_sibling()`, `root()`, and `treeposition()`) are automatically updated. 
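Before the worked example that follows, here is a small added sketch (not part of the original test file) of the same bookkeeping when a child is detached:

    >>> demo = ParentedTree.fromstring('(NP (DET the) (NOUN dog))')
    >>> noun = demo[1]
    >>> print(noun.parent_index(), noun.left_sibling())
    1 (DET the)
    >>> demo.remove(noun)
    >>> print(noun.parent(), noun.parent_index(), noun.treeposition())
    None None ()
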
For example, if we replace ``ptree``\ 's subtree for the word "dog" with a new subtree for "cat," the method values for both the "dog" subtree and the "cat" subtree get automatically updated: >>> # Replace the dog with a cat >>> dog = ptree[1,1] >>> cat = ParentedTree('NOUN', ['cat']) >>> ptree[1,1] = cat >>> # the noun phrase is no longer the dog's parent: >>> print(dog.parent(), dog.parent_index(), dog.left_sibling()) None None None >>> # dog is now its own root. >>> print(dog.root()) (NOUN dog) >>> print(dog.treeposition()) () >>> # the cat's parent is now the noun phrase: >>> print(cat.parent()) (NP (DET the) (NOUN cat)) >>> print(cat.parent_index()) 1 >>> print(cat.left_sibling()) (DET the) >>> print(cat.root()) (VP (VERB saw) (NP (DET the) (NOUN cat))) >>> print(cat.treeposition()) (1, 1) ParentedTree Regression Tests ----------------------------- Keep track of all trees that we create (including subtrees) using this variable: >>> all_ptrees = [] Define a helper funciton to create new parented trees: >>> def make_ptree(s): ... ptree = ParentedTree.convert(Tree.fromstring(s)) ... all_ptrees.extend(t for t in ptree.subtrees() ... if isinstance(t, Tree)) ... return ptree Define a test function that examines every subtree in all_ptrees; and checks that all six of its methods are defined correctly. If any ptrees are passed as arguments, then they are printed. >>> def pcheck(*print_ptrees): ... for ptree in all_ptrees: ... # Check ptree's methods. ... if ptree.parent() is not None: ... i = ptree.parent_index() ... assert ptree.parent()[i] is ptree ... if i > 0: ... assert ptree.left_sibling() is ptree.parent()[i-1] ... if i < (len(ptree.parent())-1): ... assert ptree.right_sibling() is ptree.parent()[i+1] ... assert len(ptree.treeposition()) > 0 ... assert (ptree.treeposition() == ... ptree.parent().treeposition() + (ptree.parent_index(),)) ... assert ptree.root() is not ptree ... assert ptree.root() is not None ... assert ptree.root() is ptree.parent().root() ... assert ptree.root()[ptree.treeposition()] is ptree ... else: ... assert ptree.parent_index() is None ... assert ptree.left_sibling() is None ... assert ptree.right_sibling() is None ... assert ptree.root() is ptree ... assert ptree.treeposition() == () ... # Check ptree's children's methods: ... for i, child in enumerate(ptree): ... if isinstance(child, Tree): ... # pcheck parent() & parent_index() methods ... assert child.parent() is ptree ... assert child.parent_index() == i ... # pcheck sibling methods ... if i == 0: ... assert child.left_sibling() is None ... else: ... assert child.left_sibling() is ptree[i-1] ... if i == len(ptree)-1: ... assert child.right_sibling() is None ... else: ... assert child.right_sibling() is ptree[i+1] ... if print_ptrees: ... print('ok!', end=' ') ... for ptree in print_ptrees: print(ptree) ... else: ... print('ok!') Run our test function on a variety of newly-created trees: >>> pcheck(make_ptree('(A)')) ok! (A ) >>> pcheck(make_ptree('(A (B (C (D) (E f)) g) h)')) ok! (A (B (C (D ) (E f)) g) h) >>> pcheck(make_ptree('(A (B) (C c) (D d d) (E e e e))')) ok! (A (B ) (C c) (D d d) (E e e e)) >>> pcheck(make_ptree('(A (B) (C (c)) (D (d) (d)) (E (e) (e) (e)))')) ok! (A (B ) (C (c )) (D (d ) (d )) (E (e ) (e ) (e ))) Run our test function after performing various tree-modification operations: **__delitem__()** >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> e = ptree[0,0,1] >>> del ptree[0,0,1]; pcheck(ptree); pcheck(e) ok! (A (B (C (D ) (Q p)) g) h) ok! 
(E f) >>> del ptree[0,0,0]; pcheck(ptree) ok! (A (B (C (Q p)) g) h) >>> del ptree[0,1]; pcheck(ptree) ok! (A (B (C (Q p))) h) >>> del ptree[-1]; pcheck(ptree) ok! (A (B (C (Q p)))) >>> del ptree[-100] Traceback (most recent call last): . . . IndexError: index out of range >>> del ptree[()] Traceback (most recent call last): . . . IndexError: The tree position () may not be deleted. >>> # With slices: >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') >>> b = ptree[0] >>> del ptree[0:0]; pcheck(ptree) ok! (A (B c) (D e) f g (H i) j (K l)) >>> del ptree[:1]; pcheck(ptree); pcheck(b) ok! (A (D e) f g (H i) j (K l)) ok! (B c) >>> del ptree[-2:]; pcheck(ptree) ok! (A (D e) f g (H i)) >>> del ptree[1:3]; pcheck(ptree) ok! (A (D e) (H i)) >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') >>> del ptree[5:1000]; pcheck(ptree) ok! (A (B c) (D e) f g (H i)) >>> del ptree[-2:1000]; pcheck(ptree) ok! (A (B c) (D e) f) >>> del ptree[-100:1]; pcheck(ptree) ok! (A (D e) f) >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') >>> del ptree[1:-2:2]; pcheck(ptree) ok! (A (B c) f (H i) j (K l)) **__setitem__()** >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> d, e, q = ptree[0,0] >>> ptree[0,0,0] = 'x'; pcheck(ptree); pcheck(d) ok! (A (B (C x (E f) (Q p)) g) h) ok! (D ) >>> ptree[0,0,1] = make_ptree('(X (Y z))'); pcheck(ptree); pcheck(e) ok! (A (B (C x (X (Y z)) (Q p)) g) h) ok! (E f) >>> ptree[1] = d; pcheck(ptree) ok! (A (B (C x (X (Y z)) (Q p)) g) (D )) >>> ptree[-1] = 'x'; pcheck(ptree) ok! (A (B (C x (X (Y z)) (Q p)) g) x) >>> ptree[-100] = 'y' Traceback (most recent call last): . . . IndexError: index out of range >>> ptree[()] = make_ptree('(X y)') Traceback (most recent call last): . . . IndexError: The tree position () may not be assigned to. >>> # With slices: >>> ptree = make_ptree('(A (B c) (D e) f g (H i) j (K l))') >>> b = ptree[0] >>> ptree[0:0] = ('x', make_ptree('(Y)')); pcheck(ptree) ok! (A x (Y ) (B c) (D e) f g (H i) j (K l)) >>> ptree[2:6] = (); pcheck(ptree); pcheck(b) ok! (A x (Y ) (H i) j (K l)) ok! (B c) >>> ptree[-2:] = ('z', 'p'); pcheck(ptree) ok! (A x (Y ) (H i) z p) >>> ptree[1:3] = [make_ptree('(X)') for x in range(10)]; pcheck(ptree) ok! (A x (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) z p) >>> ptree[5:1000] = []; pcheck(ptree) ok! (A x (X ) (X ) (X ) (X )) >>> ptree[-2:1000] = ['n']; pcheck(ptree) ok! (A x (X ) (X ) n) >>> ptree[-100:1] = [make_ptree('(U v)')]; pcheck(ptree) ok! (A (U v) (X ) (X ) n) >>> ptree[-1:] = (make_ptree('(X)') for x in range(3)); pcheck(ptree) ok! (A (U v) (X ) (X ) (X ) (X ) (X )) >>> ptree[1:-2:2] = ['x', 'y']; pcheck(ptree) ok! (A (U v) x (X ) y (X ) (X )) **append()** >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> ptree.append('x'); pcheck(ptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x) >>> ptree.append(make_ptree('(X (Y z))')); pcheck(ptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x (X (Y z))) **extend()** >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> ptree.extend(['x', 'y', make_ptree('(X (Y z))')]); pcheck(ptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) >>> ptree.extend([]); pcheck(ptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) >>> ptree.extend(make_ptree('(X)') for x in range(3)); pcheck(ptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z)) (X ) (X ) (X )) **insert()** >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> ptree.insert(0, make_ptree('(X (Y z))')); pcheck(ptree) ok! 
(A (X (Y z)) (B (C (D ) (E f) (Q p)) g) h) >>> ptree.insert(-1, make_ptree('(X (Y z))')); pcheck(ptree) ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) >>> ptree.insert(-4, make_ptree('(X (Y z))')); pcheck(ptree) ok! (A (X (Y z)) (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) >>> # Note: as with ``list``, inserting at a negative index that >>> # gives a position before the start of the list does *not* >>> # raise an IndexError exception; it just inserts at 0. >>> ptree.insert(-400, make_ptree('(X (Y z))')); pcheck(ptree) ok! (A (X (Y z)) (X (Y z)) (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) **pop()** >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> ptree[0,0].pop(1); pcheck(ptree) ParentedTree('E', ['f']) ok! (A (B (C (D ) (Q p)) g) h) >>> ptree[0].pop(-1); pcheck(ptree) 'g' ok! (A (B (C (D ) (Q p))) h) >>> ptree.pop(); pcheck(ptree) 'h' ok! (A (B (C (D ) (Q p)))) >>> ptree.pop(-100) Traceback (most recent call last): . . . IndexError: index out of range **remove()** >>> ptree = make_ptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> e = ptree[0,0,1] >>> ptree[0,0].remove(ptree[0,0,1]); pcheck(ptree); pcheck(e) ok! (A (B (C (D ) (Q p)) g) h) ok! (E f) >>> ptree[0,0].remove(make_ptree('(Q p)')); pcheck(ptree) ok! (A (B (C (D )) g) h) >>> ptree[0,0].remove(make_ptree('(Q p)')) Traceback (most recent call last): . . . ValueError: ParentedTree('Q', ['p']) is not in list >>> ptree.remove('h'); pcheck(ptree) ok! (A (B (C (D )) g)) >>> ptree.remove('h'); Traceback (most recent call last): . . . ValueError: 'h' is not in list >>> # remove() removes the first subtree that is equal (==) to the >>> # given tree, which may not be the identical tree we give it: >>> ptree = make_ptree('(A (X x) (Y y) (X x))') >>> x1, y, x2 = ptree >>> ptree.remove(ptree[-1]); pcheck(ptree) ok! (A (Y y) (X x)) >>> print(x1.parent()); pcheck(x1) None ok! (X x) >>> print(x2.parent()) (A (Y y) (X x)) Test that a tree can not be given multiple parents: >>> ptree = make_ptree('(A (X x) (Y y) (Z z))') >>> ptree[0] = ptree[1] Traceback (most recent call last): . . . ValueError: Can not insert a subtree that already has a parent. >>> pcheck() ok! [more to be written] ImmutableParentedTree Regression Tests -------------------------------------- >>> iptree = ImmutableParentedTree.convert(ptree) >>> type(iptree) >>> del iptree[0] Traceback (most recent call last): . . . ValueError: ImmutableParentedTree may not be modified >>> iptree.set_label('newnode') Traceback (most recent call last): . . . ValueError: ImmutableParentedTree may not be modified MultiParentedTree Regression Tests ---------------------------------- Keep track of all trees that we create (including subtrees) using this variable: >>> all_mptrees = [] Define a helper funciton to create new parented trees: >>> def make_mptree(s): ... mptree = MultiParentedTree.convert(Tree.fromstring(s)) ... all_mptrees.extend(t for t in mptree.subtrees() ... if isinstance(t, Tree)) ... return mptree Define a test function that examines every subtree in all_mptrees; and checks that all six of its methods are defined correctly. If any mptrees are passed as arguments, then they are printed. >>> def mpcheck(*print_mptrees): ... def has(seq, val): # uses identity comparison ... for item in seq: ... if item is val: return True ... return False ... for mptree in all_mptrees: ... # Check mptree's methods. ... if len(mptree.parents()) == 0: ... assert len(mptree.left_siblings()) == 0 ... assert len(mptree.right_siblings()) == 0 ... assert len(mptree.roots()) == 1 ... 
assert mptree.roots()[0] is mptree ... assert mptree.treepositions(mptree) == [()] ... left_siblings = right_siblings = () ... roots = {id(mptree): 1} ... else: ... roots = dict((id(r), 0) for r in mptree.roots()) ... left_siblings = mptree.left_siblings() ... right_siblings = mptree.right_siblings() ... for parent in mptree.parents(): ... for i in mptree.parent_indices(parent): ... assert parent[i] is mptree ... # check left siblings ... if i > 0: ... for j in range(len(left_siblings)): ... if left_siblings[j] is parent[i-1]: ... del left_siblings[j] ... break ... else: ... assert 0, 'sibling not found!' ... # check ight siblings ... if i < (len(parent)-1): ... for j in range(len(right_siblings)): ... if right_siblings[j] is parent[i+1]: ... del right_siblings[j] ... break ... else: ... assert 0, 'sibling not found!' ... # check roots ... for root in parent.roots(): ... assert id(root) in roots, 'missing root' ... roots[id(root)] += 1 ... # check that we don't have any unexplained values ... assert len(left_siblings)==0, 'unexpected sibling' ... assert len(right_siblings)==0, 'unexpected sibling' ... for v in roots.values(): assert v>0, roots #'unexpected root' ... # check treepositions ... for root in mptree.roots(): ... for treepos in mptree.treepositions(root): ... assert root[treepos] is mptree ... # Check mptree's children's methods: ... for i, child in enumerate(mptree): ... if isinstance(child, Tree): ... # mpcheck parent() & parent_index() methods ... assert has(child.parents(), mptree) ... assert i in child.parent_indices(mptree) ... # mpcheck sibling methods ... if i > 0: ... assert has(child.left_siblings(), mptree[i-1]) ... if i < len(mptree)-1: ... assert has(child.right_siblings(), mptree[i+1]) ... if print_mptrees: ... print('ok!', end=' ') ... for mptree in print_mptrees: print(mptree) ... else: ... print('ok!') Run our test function on a variety of newly-created trees: >>> mpcheck(make_mptree('(A)')) ok! (A ) >>> mpcheck(make_mptree('(A (B (C (D) (E f)) g) h)')) ok! (A (B (C (D ) (E f)) g) h) >>> mpcheck(make_mptree('(A (B) (C c) (D d d) (E e e e))')) ok! (A (B ) (C c) (D d d) (E e e e)) >>> mpcheck(make_mptree('(A (B) (C (c)) (D (d) (d)) (E (e) (e) (e)))')) ok! (A (B ) (C (c )) (D (d ) (d )) (E (e ) (e ) (e ))) >>> subtree = make_mptree('(A (B (C (D) (E f)) g) h)') Including some trees that contain multiple parents: >>> mpcheck(MultiParentedTree('Z', [subtree, subtree])) ok! (Z (A (B (C (D ) (E f)) g) h) (A (B (C (D ) (E f)) g) h)) Run our test function after performing various tree-modification operations (n.b., these are the same tests that we ran for `ParentedTree`, above; thus, none of these trees actually *uses* multiple parents.) **__delitem__()** >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> e = mptree[0,0,1] >>> del mptree[0,0,1]; mpcheck(mptree); mpcheck(e) ok! (A (B (C (D ) (Q p)) g) h) ok! (E f) >>> del mptree[0,0,0]; mpcheck(mptree) ok! (A (B (C (Q p)) g) h) >>> del mptree[0,1]; mpcheck(mptree) ok! (A (B (C (Q p))) h) >>> del mptree[-1]; mpcheck(mptree) ok! (A (B (C (Q p)))) >>> del mptree[-100] Traceback (most recent call last): . . . IndexError: index out of range >>> del mptree[()] Traceback (most recent call last): . . . IndexError: The tree position () may not be deleted. >>> # With slices: >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') >>> b = mptree[0] >>> del mptree[0:0]; mpcheck(mptree) ok! (A (B c) (D e) f g (H i) j (K l)) >>> del mptree[:1]; mpcheck(mptree); mpcheck(b) ok! (A (D e) f g (H i) j (K l)) ok! 
(B c) >>> del mptree[-2:]; mpcheck(mptree) ok! (A (D e) f g (H i)) >>> del mptree[1:3]; mpcheck(mptree) ok! (A (D e) (H i)) >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') >>> del mptree[5:1000]; mpcheck(mptree) ok! (A (B c) (D e) f g (H i)) >>> del mptree[-2:1000]; mpcheck(mptree) ok! (A (B c) (D e) f) >>> del mptree[-100:1]; mpcheck(mptree) ok! (A (D e) f) >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') >>> del mptree[1:-2:2]; mpcheck(mptree) ok! (A (B c) f (H i) j (K l)) **__setitem__()** >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> d, e, q = mptree[0,0] >>> mptree[0,0,0] = 'x'; mpcheck(mptree); mpcheck(d) ok! (A (B (C x (E f) (Q p)) g) h) ok! (D ) >>> mptree[0,0,1] = make_mptree('(X (Y z))'); mpcheck(mptree); mpcheck(e) ok! (A (B (C x (X (Y z)) (Q p)) g) h) ok! (E f) >>> mptree[1] = d; mpcheck(mptree) ok! (A (B (C x (X (Y z)) (Q p)) g) (D )) >>> mptree[-1] = 'x'; mpcheck(mptree) ok! (A (B (C x (X (Y z)) (Q p)) g) x) >>> mptree[-100] = 'y' Traceback (most recent call last): . . . IndexError: index out of range >>> mptree[()] = make_mptree('(X y)') Traceback (most recent call last): . . . IndexError: The tree position () may not be assigned to. >>> # With slices: >>> mptree = make_mptree('(A (B c) (D e) f g (H i) j (K l))') >>> b = mptree[0] >>> mptree[0:0] = ('x', make_mptree('(Y)')); mpcheck(mptree) ok! (A x (Y ) (B c) (D e) f g (H i) j (K l)) >>> mptree[2:6] = (); mpcheck(mptree); mpcheck(b) ok! (A x (Y ) (H i) j (K l)) ok! (B c) >>> mptree[-2:] = ('z', 'p'); mpcheck(mptree) ok! (A x (Y ) (H i) z p) >>> mptree[1:3] = [make_mptree('(X)') for x in range(10)]; mpcheck(mptree) ok! (A x (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) (X ) z p) >>> mptree[5:1000] = []; mpcheck(mptree) ok! (A x (X ) (X ) (X ) (X )) >>> mptree[-2:1000] = ['n']; mpcheck(mptree) ok! (A x (X ) (X ) n) >>> mptree[-100:1] = [make_mptree('(U v)')]; mpcheck(mptree) ok! (A (U v) (X ) (X ) n) >>> mptree[-1:] = (make_mptree('(X)') for x in range(3)); mpcheck(mptree) ok! (A (U v) (X ) (X ) (X ) (X ) (X )) >>> mptree[1:-2:2] = ['x', 'y']; mpcheck(mptree) ok! (A (U v) x (X ) y (X ) (X )) **append()** >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> mptree.append('x'); mpcheck(mptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x) >>> mptree.append(make_mptree('(X (Y z))')); mpcheck(mptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x (X (Y z))) **extend()** >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> mptree.extend(['x', 'y', make_mptree('(X (Y z))')]); mpcheck(mptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) >>> mptree.extend([]); mpcheck(mptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z))) >>> mptree.extend(make_mptree('(X)') for x in range(3)); mpcheck(mptree) ok! (A (B (C (D ) (E f) (Q p)) g) h x y (X (Y z)) (X ) (X ) (X )) **insert()** >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> mptree.insert(0, make_mptree('(X (Y z))')); mpcheck(mptree) ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) h) >>> mptree.insert(-1, make_mptree('(X (Y z))')); mpcheck(mptree) ok! (A (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) >>> mptree.insert(-4, make_mptree('(X (Y z))')); mpcheck(mptree) ok! (A (X (Y z)) (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) >>> # Note: as with ``list``, inserting at a negative index that >>> # gives a position before the start of the list does *not* >>> # raise an IndexError exception; it just inserts at 0. >>> mptree.insert(-400, make_mptree('(X (Y z))')); mpcheck(mptree) ok! 
(A (X (Y z)) (X (Y z)) (X (Y z)) (B (C (D ) (E f) (Q p)) g) (X (Y z)) h) **pop()** >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> mptree[0,0].pop(1); mpcheck(mptree) MultiParentedTree('E', ['f']) ok! (A (B (C (D ) (Q p)) g) h) >>> mptree[0].pop(-1); mpcheck(mptree) 'g' ok! (A (B (C (D ) (Q p))) h) >>> mptree.pop(); mpcheck(mptree) 'h' ok! (A (B (C (D ) (Q p)))) >>> mptree.pop(-100) Traceback (most recent call last): . . . IndexError: index out of range **remove()** >>> mptree = make_mptree('(A (B (C (D) (E f) (Q p)) g) h)') >>> e = mptree[0,0,1] >>> mptree[0,0].remove(mptree[0,0,1]); mpcheck(mptree); mpcheck(e) ok! (A (B (C (D ) (Q p)) g) h) ok! (E f) >>> mptree[0,0].remove(make_mptree('(Q p)')); mpcheck(mptree) ok! (A (B (C (D )) g) h) >>> mptree[0,0].remove(make_mptree('(Q p)')) Traceback (most recent call last): . . . ValueError: MultiParentedTree('Q', ['p']) is not in list >>> mptree.remove('h'); mpcheck(mptree) ok! (A (B (C (D )) g)) >>> mptree.remove('h'); Traceback (most recent call last): . . . ValueError: 'h' is not in list >>> # remove() removes the first subtree that is equal (==) to the >>> # given tree, which may not be the identical tree we give it: >>> mptree = make_mptree('(A (X x) (Y y) (X x))') >>> x1, y, x2 = mptree >>> mptree.remove(mptree[-1]); mpcheck(mptree) ok! (A (Y y) (X x)) >>> print([str(p) for p in x1.parents()]) [] >>> print([str(p) for p in x2.parents()]) ['(A (Y y) (X x))'] ImmutableMultiParentedTree Regression Tests ------------------------------------------- >>> imptree = ImmutableMultiParentedTree.convert(mptree) >>> type(imptree) >>> del imptree[0] Traceback (most recent call last): . . . ValueError: ImmutableMultiParentedTree may not be modified >>> imptree.set_label('newnode') Traceback (most recent call last): . . . ValueError: ImmutableMultiParentedTree may not be modified ProbabilisticTree Regression Tests ---------------------------------- >>> prtree = ProbabilisticTree("S", [ProbabilisticTree("NP", ["N"], prob=0.3)], prob=0.6) >>> print(prtree) (S (NP N)) (p=0.6) >>> import copy >>> prtree == copy.deepcopy(prtree) == prtree.copy(deep=True) == prtree.copy() True >>> prtree[0] is prtree.copy()[0] True >>> prtree[0] is prtree.copy(deep=True)[0] False >>> imprtree = ImmutableProbabilisticTree.convert(prtree) >>> type(imprtree) >>> del imprtree[0] Traceback (most recent call last): . . . ValueError: ImmutableProbabilisticTree may not be modified >>> imprtree.set_label('newnode') Traceback (most recent call last): . . . ValueError: ImmutableProbabilisticTree may not be modified Squashed Bugs ============= This used to discard the ``(B b)`` subtree (fixed in svn 6270): >>> print(Tree.fromstring('((A a) (B b))')) ( (A a) (B b)) nltk-3.1/nltk/test/treeprettyprinter.doctest0000644000076500000240000002033312607224144021210 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ======================================================== Unit tests for nltk.treeprettyprinter.TreePrettyPrinter ======================================================== >>> from nltk.tree import Tree >>> from nltk.treeprettyprinter import TreePrettyPrinter Tree nr 2170 from nltk.corpus.treebank: >>> tree = Tree.fromstring( ... '(S (NP-SBJ (PRP I)) (VP (VBP feel) (ADJP-PRD (RB pretty) ' ... '(JJ good)) (PP-CLR (IN about) (NP (PRP it)))) (. 
.))') >>> tpp = TreePrettyPrinter(tree) >>> print(tpp.text()) S __________________________|_____________________ | VP | | ____________________|___________ | | | | PP-CLR | | | | _____|_____ | NP-SBJ | ADJP-PRD | NP | | | _______|______ | | | PRP VBP RB JJ IN PRP . | | | | | | | I feel pretty good about it . >>> print(tpp.text(unicodelines=True)) S ┌──────────────────────────┼─────────────────────┠│ VP │ │ ┌─────────────┬──────┴───────────┠│ │ │ │ PP-CLR │ │ │ │ ┌─────┴─────┠│ NP-SBJ │ ADJP-PRD │ NP │ │ │ ┌───────┴──────┠│ │ │ PRP VBP RB JJ IN PRP . │ │ │ │ │ │ │ I feel pretty good about it . A tree with long labels: >>> tree = Tree.fromstring( ... '(sentence (plural-noun-phrase (plural-noun Superconductors)) ' ... '(verb-phrase (plural-verb conduct) ' ... '(noun-phrase (singular-noun electricity))))') >>> tpp = TreePrettyPrinter(tree) >>> print(tpp.text(abbreviate=8, nodedist=2)) sentence __________|__________ | verb-phr. | __________|__________ plural-n. | noun-phr. | | | plural-n. plural-v. singular. | | | Supercon. conduct electric. >>> print(tpp.text(maxwidth=8, nodedist=2)) sentence _________|________ | verb- | phrase | ________|_________ plural- | noun- noun- | phrase phrase | | | | | plural- plural- singular- noun verb noun | | | Supercon conduct electric ductors ity A discontinuous tree: >>> tree = Tree.fromstring( ... '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) ' ... '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) ' ... '(vg 10) (inf (verb 11)))))) (punct 12))', read_leaf=int) >>> sentence = ('Ze had met haar moeder kunnen gaan winkelen ,' ... ' zwemmen of terrassen .'.split()) >>> tpp = TreePrettyPrinter(tree, sentence) >>> print(tpp.text()) top _____|______________________________________________ smain | | _______________________________|_____ | | | | inf | | | | _____|____ | | | | | inf | | | | | ____|_____ | | | | | | conj | | | | _____ | ___ | _________|______ | __________________ | | | inf | | | | | | | | | _________|_____ | ___ | _________ | | | | | | | pp | | | | | | | | | | ____|____ | | | | | | | | | | | np | | | | inf | inf | | | | ____|____ | | | | | | | | noun verb prep det noun verb verb verb punct verb vg verb punct | | | | | | | | | | | | | Ze had met haar moeder kunnen gaan winkelen , zwemmen of terrassen . >>> print(tpp.text(unicodelines=True)) top ┌─────┴──────────────────┬───────────────────────────┠smain │ │ ┌────┬──────────────────────────┴─────┠│ │ │ │ inf │ │ │ │ ┌─────┴────┠│ │ │ │ │ inf │ │ │ │ │ ┌────┴─────┠│ │ │ │ │ │ conj │ │ │ │ ┌───── │ ─── │ ─────────┴────── │ ─────┬─────┬──────┠│ │ │ inf │ │ │ │ │ │ │ │ │ ┌─────────┴───── │ ─── │ ─────────┠│ │ │ │ │ │ │ pp │ │ │ │ │ │ │ │ │ │ ┌────┴────┠│ │ │ │ │ │ │ │ │ │ │ np │ │ │ │ inf │ inf │ │ │ │ ┌────┴────┠│ │ │ │ │ │ │ │ noun verb prep det noun verb verb verb punct verb vg verb punct │ │ │ │ │ │ │ │ │ │ │ │ │ Ze had met haar moeder kunnen gaan winkelen , zwemmen of terrassen . nltk-3.1/nltk/test/treetransforms.doctest0000644000076500000240000001133712607224144020457 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. 
For license information, see LICENSE.TXT ------------------------------------------- Unit tests for the TreeTransformation class ------------------------------------------- >>> from copy import deepcopy >>> from nltk.tree import * >>> from nltk.treetransforms import * >>> tree_string = "(TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .)))" >>> tree = Tree.fromstring(tree_string) >>> print(tree) (TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .))) Make a copy of the original tree and collapse the subtrees with only one child >>> collapsedTree = deepcopy(tree) >>> collapse_unary(collapsedTree) >>> print(collapsedTree) (TOP (S (S+VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room)))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .))) >>> collapsedTree2 = deepcopy(tree) >>> collapse_unary(collapsedTree2, collapsePOS=True, collapseRoot=True) >>> print(collapsedTree2) (TOP+S (S+VP (VBN Turned) (ADVP+RB loose) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room)))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP+RB little) (ADJP+RB right))) (. .)) Convert the tree to Chomsky Normal Form i.e. each subtree has either two subtree children or a single leaf value. This conversion can be performed using either left- or right-factoring. >>> cnfTree = deepcopy(collapsedTree) >>> chomsky_normal_form(cnfTree, factor='left') >>> print(cnfTree) (TOP (S (S| (S| (S| (S+VP (S+VP| (VBN Turned) (ADVP (RB loose))) (PP (IN in) (NP (NP| (NP (NP| (NNP Shane) (NNP Longman)) (POS 's)) (NN trading)) (NN room)))) (, ,)) (NP (NP| (DT the) (NN yuppie)) (NNS dealers))) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right))))) (. .))) >>> cnfTree = deepcopy(collapsedTree) >>> chomsky_normal_form(cnfTree, factor='right') >>> print(cnfTree) (TOP (S (S+VP (VBN Turned) (S+VP| (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NP| (NNP Longman) (POS 's))) (NP| (NN trading) (NN room)))))) (S|<,-NP-VP-.> (, ,) (S| (NP (DT the) (NP| (NN yuppie) (NNS dealers))) (S| (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. .)))))) Employ some Markov smoothing to make the artificial node labels a bit more readable. See the treetransforms.py documentation for more details. >>> markovTree = deepcopy(collapsedTree) >>> chomsky_normal_form(markovTree, horzMarkov=2, vertMarkov=1) >>> print(markovTree) (TOP (S^ (S+VP^ (VBN Turned) (S+VP|^ (ADVP^ (RB loose)) (PP^ (IN in) (NP^ (NP^ (NNP Shane) (NP|^ (NNP Longman) (POS 's))) (NP|^ (NN trading) (NN room)))))) (S|<,-NP>^ (, ,) (S|^ (NP^ (DT the) (NP|^ (NN yuppie) (NNS dealers))) (S|^ (VP^ (AUX do) (NP^ (NP^ (RB little)) (ADJP^ (RB right)))) (. 
.)))))) Convert the transformed tree back to its original form >>> un_chomsky_normal_form(markovTree) >>> tree == markovTree True nltk-3.1/nltk/test/unit/0000755000076500000240000000000012610001541014750 5ustar sbstaff00000000000000nltk-3.1/nltk/test/unit/__init__.py0000644000076500000240000000000012574600335017066 0ustar sbstaff00000000000000nltk-3.1/nltk/test/unit/test_2x_compat.py0000644000076500000240000000133212574600335020273 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- """ Unit tests for nltk.compat. See also nltk/test/compat.doctest. """ from __future__ import absolute_import, unicode_literals import unittest from nltk.text import Text from nltk.compat import PY3, python_2_unicode_compatible def setup_module(module): from nose import SkipTest if PY3: raise SkipTest("test_2x_compat is for testing nltk.compat under Python 2.x") class TestTextTransliteration(unittest.TestCase): txt = Text(["São", "Tomé", "and", "Príncipe"]) def test_repr(self): self.assertEqual(repr(self.txt), br"") def test_str(self): self.assertEqual(str(self.txt), b"") nltk-3.1/nltk/test/unit/test_classify.py0000644000076500000240000000253212574600335020217 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- """ Unit tests for nltk.classify. See also: nltk/test/classify.doctest """ from __future__ import absolute_import from nose import SkipTest from nltk import classify TRAIN = [ (dict(a=1,b=1,c=1), 'y'), (dict(a=1,b=1,c=1), 'x'), (dict(a=1,b=1,c=0), 'y'), (dict(a=0,b=1,c=1), 'x'), (dict(a=0,b=1,c=1), 'y'), (dict(a=0,b=0,c=1), 'y'), (dict(a=0,b=1,c=0), 'x'), (dict(a=0,b=0,c=0), 'x'), (dict(a=0,b=1,c=1), 'y'), ] TEST = [ (dict(a=1,b=0,c=1)), # unseen (dict(a=1,b=0,c=0)), # unseen (dict(a=0,b=1,c=1)), # seen 3 times, labels=y,y,x (dict(a=0,b=1,c=0)), # seen 1 time, label=x ] RESULTS = [ (0.16, 0.84), (0.46, 0.54), (0.41, 0.59), (0.76, 0.24), ] def assert_classifier_correct(algorithm): try: classifier = classify.MaxentClassifier.train( TRAIN, algorithm, trace=0, max_iter=1000 ) except (LookupError, AttributeError) as e: raise SkipTest(str(e)) for (px, py), featureset in zip(RESULTS, TEST): pdist = classifier.prob_classify(featureset) assert abs(pdist.prob('x') - px) < 1e-2, (pdist.prob('x'), px) assert abs(pdist.prob('y') - py) < 1e-2, (pdist.prob('y'), py) def test_megam(): assert_classifier_correct('MEGAM') def test_tadm(): assert_classifier_correct('TADM') nltk-3.1/nltk/test/unit/test_collocations.py0000644000076500000240000000700212574600335021070 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import, unicode_literals import unittest from nltk.collocations import BigramCollocationFinder from nltk.metrics import BigramAssocMeasures ## Test bigram counters with discontinuous bigrams and repeated words _EPSILON = 1e-8 def close_enough(x, y): """Verify that two sequences of n-gram association values are within _EPSILON of each other. 
""" for (x1, y1) in zip(x, y): if x1[0] != y1[0] or abs(x1[1] - y1[1]) > _EPSILON: return False return True class TestBigram(unittest.TestCase): def test_bigram2(self): sent = 'this this is is a a test test'.split() b = BigramCollocationFinder.from_words(sent) #python 2.6 does not have assertItemsEqual or assertListEqual self.assertEqual( sorted(b.ngram_fd.items()), sorted([(('a', 'a'), 1), (('a', 'test'), 1), (('is', 'a'), 1), (('is', 'is'), 1), (('test', 'test'), 1), (('this', 'is'), 1), (('this', 'this'), 1)]) ) self.assertEqual( sorted(b.word_fd.items()), sorted([('a', 2), ('is', 2), ('test', 2), ('this', 2)]) ) self.assertTrue(len(sent) == sum(b.word_fd.values()) == sum(b.ngram_fd.values()) + 1) self.assertTrue(close_enough( sorted(b.score_ngrams(BigramAssocMeasures.pmi)), sorted([(('a', 'a'), 1.0), (('a', 'test'), 1.0), (('is', 'a'), 1.0), (('is', 'is'), 1.0), (('test', 'test'), 1.0), (('this', 'is'), 1.0), (('this', 'this'), 1.0)]) )) def test_bigram3(self): sent = 'this this is is a a test test'.split() b = BigramCollocationFinder.from_words(sent, window_size=3) self.assertEqual( sorted(b.ngram_fd.items()), sorted([(('a', 'test'), 3), (('is', 'a'), 3), (('this', 'is'), 3), (('a', 'a'), 1), (('is', 'is'), 1), (('test', 'test'), 1), (('this', 'this'), 1)]) ) self.assertEqual( sorted(b.word_fd.items()), sorted([('a', 2), ('is', 2), ('test', 2), ('this', 2)]) ) self.assertTrue(len(sent) == sum(b.word_fd.values()) == (sum(b.ngram_fd.values()) + 2 + 1) / 2.0) self.assertTrue(close_enough( sorted(b.score_ngrams(BigramAssocMeasures.pmi)), sorted([(('a', 'test'), 1.584962500721156), (('is', 'a'), 1.584962500721156), (('this', 'is'), 1.584962500721156), (('a', 'a'), 0.0), (('is', 'is'), 0.0), (('test', 'test'), 0.0), (('this', 'this'), 0.0)]) )) def test_bigram5(self): sent = 'this this is is a a test test'.split() b = BigramCollocationFinder.from_words(sent, window_size=5) self.assertEqual( sorted(b.ngram_fd.items()), sorted([(('a', 'test'), 4), (('is', 'a'), 4), (('this', 'is'), 4), (('is', 'test'), 3), (('this', 'a'), 3), (('a', 'a'), 1), (('is', 'is'), 1), (('test', 'test'), 1), (('this', 'this'), 1)]) ) self.assertEqual( sorted(b.word_fd.items()), sorted([('a', 2), ('is', 2), ('test', 2), ('this', 2)]) ) self.assertTrue(len(sent) == sum(b.word_fd.values()) == (sum(b.ngram_fd.values()) + 4 + 3 + 2 + 1) / 4.0) self.assertTrue(close_enough( sorted(b.score_ngrams(BigramAssocMeasures.pmi)), sorted([(('a', 'test'), 1.0), (('is', 'a'), 1.0), (('this', 'is'), 1.0), (('is', 'test'), 0.5849625007211562), (('this', 'a'), 0.5849625007211562), (('a', 'a'), -1.0), (('is', 'is'), -1.0), (('test', 'test'), -1.0), (('this', 'this'), -1.0)]) )) nltk-3.1/nltk/test/unit/test_corpora.py0000644000076500000240000001377312607224144020055 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import, unicode_literals import unittest from nltk.corpus import (sinica_treebank, conll2007, indian, cess_cat, cess_esp, floresta, ptb, udhr) from nltk.tree import Tree from nltk.test.unit.utils import skipIf class TestUdhr(unittest.TestCase): def test_words(self): for name in udhr.fileids(): try: words = list(udhr.words(name)) except AssertionError: print(name) raise self.assertTrue(words) def test_raw_unicode(self): for name in udhr.fileids(): txt = udhr.raw(name) assert not isinstance(txt, bytes), name class TestIndian(unittest.TestCase): def test_words(self): words = indian.words()[:3] self.assertEqual(words, ['মহিষের', 'সনà§à¦¤à¦¾à¦¨', ':']) def test_tagged_words(self): tagged_words = 
indian.tagged_words()[:3] self.assertEqual(tagged_words, [('মহিষের', 'NN'), ('সনà§à¦¤à¦¾à¦¨', 'NN'), (':', 'SYM')]) class TestCess(unittest.TestCase): def test_catalan(self): words = cess_cat.words()[:15] txt = "El Tribunal_Suprem -Fpa- TS -Fpt- ha confirmat la condemna a quatre anys d' inhabilitació especial" self.assertEqual(words, txt.split()) self.assertEqual(cess_cat.tagged_sents()[0][34][0], "càrrecs") def test_esp(self): words = cess_esp.words()[:15] txt = "El grupo estatal Electricité_de_France -Fpa- EDF -Fpt- anunció hoy , jueves , la compra del" self.assertEqual(words, txt.split()) self.assertEqual(cess_esp.words()[115], "años") class TestFloresta(unittest.TestCase): def test_words(self): words = floresta.words()[:10] txt = "Um revivalismo refrescante O 7_e_Meio é um ex-libris de a" self.assertEqual(words, txt.split()) class TestSinicaTreebank(unittest.TestCase): def test_sents(self): first_3_sents = sinica_treebank.sents()[:3] self.assertEqual( first_3_sents, [['一'], ['勿ƒ…'], ['嘉ç', 'å’Œ', '我', 'ä½åœ¨', 'åŒä¸€æ¢', 'å··å­']] ) def test_parsed_sents(self): parsed_sents = sinica_treebank.parsed_sents()[25] self.assertEqual(parsed_sents, Tree('S', [ Tree('NP', [ Tree('Nba', ['嘉ç']) ]), Tree('V‧地', [ Tree('VA11', ['ä¸åœ']), Tree('DE', ['çš„']) ]), Tree('VA4', ['å“­æ³£']) ])) class TestCoNLL2007(unittest.TestCase): # Reading the CoNLL 2007 Dependency Treebanks def test_sents(self): sents = conll2007.sents('esp.train')[0] self.assertEqual( sents[:6], ['El', 'aumento', 'del', 'índice', 'de', 'desempleo'] ) def test_parsed_sents(self): parsed_sents = conll2007.parsed_sents('esp.train')[0] self.assertEqual(parsed_sents.tree(), Tree('fortaleció', [ Tree('aumento', [ 'El', Tree('del', [ Tree('índice', [ Tree('de', [ Tree('desempleo', ['estadounidense']) ]) ]) ]) ]), 'hoy', 'considerablemente', Tree('al', [ Tree('euro', [ Tree('cotizaba', [ ',', 'que', Tree('a', [ Tree('15.35', ['las', 'GMT']) ]), 'se', Tree('en', [ Tree('mercado', [ 'el', Tree('de', ['divisas']), Tree('de', ['Fráncfort']) ]) ]), Tree('a', ['0,9452_dólares']), Tree('frente_a', [ ',', Tree('0,9349_dólares', [ 'los', Tree('de', [ Tree('mañana', ['esta']) ]) ]) ]) ]) ]) ]), '.' 
]) ) @skipIf(not ptb.fileids(), "A full installation of the Penn Treebank is not available") class TestPTB(unittest.TestCase): def test_fileids(self): self.assertEqual( ptb.fileids()[:4], ['BROWN/CF/CF01.MRG', 'BROWN/CF/CF02.MRG', 'BROWN/CF/CF03.MRG', 'BROWN/CF/CF04.MRG'] ) def test_words(self): self.assertEqual( ptb.words('WSJ/00/WSJ_0003.MRG')[:7], ['A', 'form', 'of', 'asbestos', 'once', 'used', '*'] ) def test_tagged_words(self): self.assertEqual( ptb.tagged_words('WSJ/00/WSJ_0003.MRG')[:3], [('A', 'DT'), ('form', 'NN'), ('of', 'IN')] ) def test_categories(self): self.assertEqual( ptb.categories(), ['adventure', 'belles_lettres', 'fiction', 'humor', 'lore', 'mystery', 'news', 'romance', 'science_fiction'] ) def test_news_fileids(self): self.assertEqual( ptb.fileids('news')[:3], ['WSJ/00/WSJ_0001.MRG', 'WSJ/00/WSJ_0002.MRG', 'WSJ/00/WSJ_0003.MRG'] ) def test_category_words(self): self.assertEqual( ptb.words(categories=['humor','fiction'])[:6], ['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back'] ) # unload corpora from nltk.corpus import teardown_module nltk-3.1/nltk/test/unit/test_corpus_views.py0000644000076500000240000000316212574600335021132 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- """ Corpus View Regression Tests """ from __future__ import absolute_import, unicode_literals import unittest import nltk.data from nltk.corpus.reader.util import (StreamBackedCorpusView, read_whitespace_block, read_line_block) class TestCorpusViews(unittest.TestCase): linetok = nltk.LineTokenizer(blanklines='keep') names = [ 'corpora/inaugural/README', # A very short file (160 chars) 'corpora/inaugural/1793-Washington.txt', # A relatively short file (791 chars) 'corpora/inaugural/1909-Taft.txt', # A longer file (32k chars) ] def data(self): for name in self.names: f = nltk.data.find(name) with f.open() as fp: file_data = fp.read().decode('utf8') yield f, file_data def test_correct_values(self): # Check that corpus views produce the correct sequence of values. for f, file_data in self.data(): v = StreamBackedCorpusView(f, read_whitespace_block) self.assertEqual(list(v), file_data.split()) v = StreamBackedCorpusView(f, read_line_block) self.assertEqual(list(v), self.linetok.tokenize(file_data)) def test_correct_length(self): # Check that the corpus views report the correct lengths: for f, file_data in self.data(): v = StreamBackedCorpusView(f, read_whitespace_block) self.assertEqual(len(v), len(file_data.split())) v = StreamBackedCorpusView(f, read_line_block) self.assertEqual(len(v), len(self.linetok.tokenize(file_data))) nltk-3.1/nltk/test/unit/test_hmm.py0000644000076500000240000000453712574600335017172 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import, unicode_literals from nltk.tag import hmm def _wikipedia_example_hmm(): # Example from wikipedia # (http://en.wikipedia.org/wiki/Forward%E2%80%93backward_algorithm) states = ['rain', 'no rain'] symbols = ['umbrella', 'no umbrella'] A = [[0.7, 0.3], [0.3, 0.7]] # transition probabilities B = [[0.9, 0.1], [0.2, 0.8]] # emission probabilities pi = [0.5, 0.5] # initial probabilities seq = ['umbrella', 'umbrella', 'no umbrella', 'umbrella', 'umbrella'] seq = list(zip(seq, [None]*len(seq))) model = hmm._create_hmm_tagger(states, symbols, A, B, pi) return model, states, symbols, seq def test_forward_probability(): from numpy.testing import assert_array_almost_equal # example from p. 
385, Huang et al model, states, symbols = hmm._market_hmm_example() seq = [('up', None), ('up', None)] expected = [ [0.35, 0.02, 0.09], [0.1792, 0.0085, 0.0357] ] fp = 2**model._forward_probability(seq) assert_array_almost_equal(fp, expected) def test_forward_probability2(): from numpy.testing import assert_array_almost_equal model, states, symbols, seq = _wikipedia_example_hmm() fp = 2**model._forward_probability(seq) # examples in wikipedia are normalized fp = (fp.T / fp.sum(axis=1)).T wikipedia_results = [ [0.8182, 0.1818], [0.8834, 0.1166], [0.1907, 0.8093], [0.7308, 0.2692], [0.8673, 0.1327], ] assert_array_almost_equal(wikipedia_results, fp, 4) def test_backward_probability(): from numpy.testing import assert_array_almost_equal model, states, symbols, seq = _wikipedia_example_hmm() bp = 2**model._backward_probability(seq) # examples in wikipedia are normalized bp = (bp.T / bp.sum(axis=1)).T wikipedia_results = [ # Forward-backward algorithm doesn't need b0_5, # so .backward_probability doesn't compute it. # [0.6469, 0.3531], [0.5923, 0.4077], [0.3763, 0.6237], [0.6533, 0.3467], [0.6273, 0.3727], [0.5, 0.5], ] assert_array_almost_equal(wikipedia_results, bp, 4) def setup_module(module): from nose import SkipTest try: import numpy except ImportError: raise SkipTest("numpy is required for nltk.test.test_hmm") nltk-3.1/nltk/test/unit/test_json2csv_corpus.py0000644000076500000240000001743612607224144021552 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Twitter client # # Copyright (C) 2001-2015 NLTK Project # Author: Lorenzo Rubio # URL: # For license information, see LICENSE.TXT """ Regression tests for `json2csv()` and `json2csv_entities()` in Twitter package. """ import os from nltk.compat import TemporaryDirectory import unittest from nltk.corpus import twitter_samples from nltk.twitter.common import json2csv, json2csv_entities from nltk.compat import izip def are_files_identical(filename1, filename2, debug=False): """ Compare two files, ignoring carriage returns. """ with open(filename1, "rb") as fileA: with open(filename2, "rb") as fileB: result = True for lineA, lineB in izip(sorted(fileA.readlines()), sorted(fileB.readlines())): if lineA.strip() != lineB.strip(): if debug: print("Error while comparing files. " + "First difference at line below.") print("=> Output file line: {0}".format(lineA)) print("=> Refer. 
file line: {0}".format(lineB)) result = False break return result class TestJSON2CSV(unittest.TestCase): def setUp(self): with open(twitter_samples.abspath("tweets.20150430-223406.json")) as infile: self.infile = [next(infile) for x in range(100)] infile.close() self.msg = "Test and reference files are not the same" self.subdir = os.path.join(os.path.dirname(__file__), 'files') def tearDown(self): return def test_textoutput(self): ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.text.csv.ref') with TemporaryDirectory() as tempdir: outfn = os.path.join(tempdir, 'tweets.20150430-223406.text.csv') json2csv(self.infile, outfn, ['text'], gzip_compress=False) self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg) def test_tweet_metadata(self): ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.tweet.csv.ref') fields = ['created_at', 'favorite_count', 'id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_count', 'retweeted', 'text', 'truncated', 'user.id'] with TemporaryDirectory() as tempdir: outfn = os.path.join(tempdir, 'tweets.20150430-223406.tweet.csv') json2csv(self.infile, outfn, fields, gzip_compress=False) self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg) def test_user_metadata(self): ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.user.csv.ref') fields = ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count'] with TemporaryDirectory() as tempdir: outfn = os.path.join(tempdir, 'tweets.20150430-223406.user.csv') json2csv(self.infile, outfn, fields, gzip_compress=False) self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg) def test_tweet_hashtag(self): ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.hashtag.csv.ref') with TemporaryDirectory() as tempdir: outfn = os.path.join(tempdir, 'tweets.20150430-223406.hashtag.csv') json2csv_entities(self.infile, outfn, ['id', 'text'], 'hashtags', ['text'], gzip_compress=False) self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg) def test_tweet_usermention(self): ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.usermention.csv.ref') with TemporaryDirectory() as tempdir: outfn = os.path.join(tempdir, 'tweets.20150430-223406.usermention.csv') json2csv_entities(self.infile, outfn, ['id', 'text'], 'user_mentions', ['id', 'screen_name'], gzip_compress=False) self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg) def test_tweet_media(self): ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.media.csv.ref') with TemporaryDirectory() as tempdir: outfn = os.path.join(tempdir, 'tweets.20150430-223406.media.csv') json2csv_entities(self.infile, outfn, ['id'], 'media', ['media_url', 'url'], gzip_compress=False) self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg) def test_tweet_url(self): ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.url.csv.ref') with TemporaryDirectory() as tempdir: outfn = os.path.join(tempdir, 'tweets.20150430-223406.url.csv') json2csv_entities(self.infile, outfn, ['id'], 'urls', ['url', 'expanded_url'], gzip_compress=False) self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg) def test_userurl(self): ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.userurl.csv.ref') with TemporaryDirectory() as tempdir: outfn = os.path.join(tempdir, 'tweets.20150430-223406.userurl.csv') json2csv_entities(self.infile, outfn, ['id', 'screen_name'], 'user.urls', ['url', 'expanded_url'], gzip_compress=False) self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg) 
def test_tweet_place(self): ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.place.csv.ref') with TemporaryDirectory() as tempdir: outfn = os.path.join(tempdir, 'tweets.20150430-223406.place.csv') json2csv_entities(self.infile, outfn, ['id', 'text'], 'place', ['name', 'country'], gzip_compress=False) self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg) def test_tweet_place_boundingbox(self): ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.placeboundingbox.csv.ref') with TemporaryDirectory() as tempdir: outfn = os.path.join(tempdir, 'tweets.20150430-223406.placeboundingbox.csv') json2csv_entities(self.infile, outfn, ['id', 'name'], 'place.bounding_box', ['coordinates'], gzip_compress=False) self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg) def test_retweet_original_tweet(self): ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.retweet.csv.ref') with TemporaryDirectory() as tempdir: outfn = os.path.join(tempdir, 'tweets.20150430-223406.retweet.csv') json2csv_entities(self.infile, outfn, ['id'], 'retweeted_status', ['created_at', 'favorite_count', 'id', 'in_reply_to_status_id', 'in_reply_to_user_id', 'retweet_count', 'text', 'truncated', 'user.id'], gzip_compress=False) self.assertTrue(are_files_identical(outfn, ref_fn), msg=self.msg) def test_file_is_wrong(self): """ Sanity check that file comparison is not giving false positives. """ ref_fn = os.path.join(self.subdir, 'tweets.20150430-223406.retweet.csv.ref') with TemporaryDirectory() as tempdir: outfn = os.path.join(tempdir, 'tweets.20150430-223406.text.csv') json2csv(self.infile, outfn, ['text'], gzip_compress=False) self.assertFalse(are_files_identical(outfn, ref_fn), msg=self.msg) if __name__ == "__main__": unittest.main() nltk-3.1/nltk/test/unit/test_naivebayes.py0000644000076500000240000000152012574600335020524 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import print_function, unicode_literals import unittest from nltk.classify.naivebayes import NaiveBayesClassifier class NaiveBayesClassifierTest(unittest.TestCase): def test_simple(self): training_features = [ ({'nice': True, 'good': True}, 'positive'), ({'bad': True, 'mean': True}, 'negative') ] classifier = NaiveBayesClassifier.train(training_features) result = classifier.prob_classify({'nice': True}) self.assertTrue(result.prob('positive') > result.prob('negative')) self.assertEqual(result.max(), 'positive') result = classifier.prob_classify({'bad': True}) self.assertTrue(result.prob('positive') < result.prob('negative')) self.assertEqual(result.max(), 'negative') nltk-3.1/nltk/test/unit/test_seekable_unicode_stream_reader.py0000644000076500000240000000701012574600335024554 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- """ The following test performs a random series of reads, seeks, and tells, and checks that the results are consistent. 
""" from __future__ import absolute_import, unicode_literals import random import functools from io import BytesIO from nltk.corpus.reader import SeekableUnicodeStreamReader def check_reader(unicode_string, encoding, n=1000): bytestr = unicode_string.encode(encoding) strlen = len(unicode_string) stream = BytesIO(bytestr) reader = SeekableUnicodeStreamReader(stream, encoding) # Find all character positions chars = [] while True: pos = reader.tell() chars.append( (pos, reader.read(1)) ) if chars[-1][1] == '': break # Find all strings strings = dict( (pos,'') for (pos,c) in chars ) for pos1, char in chars: for pos2, _ in chars: if pos2 <= pos1: strings[pos2] += char while True: op = random.choice('tsrr') # Check our position? if op == 't': # tell reader.tell() # Perform a seek? if op == 's': # seek new_pos = random.choice([p for (p,c) in chars]) reader.seek(new_pos) # Perform a read? if op == 'r': # read if random.random() < .3: pos = reader.tell() else: pos = None if random.random() < .2: size = None elif random.random() < .8: size = random.randint(0, int(strlen/6)) else: size = random.randint(0, strlen+20) if random.random() < .8: s = reader.read(size) else: s = reader.readline(size) # check that everything's consistent if pos is not None: assert pos in strings assert strings[pos].startswith(s) n -= 1 if n == 0: return 'passed' #Call the randomized test function `check_reader` with a variety of #input strings and encodings. ENCODINGS = ['ascii', 'latin1', 'greek', 'hebrew', 'utf-16', 'utf-8'] STRINGS = [ """ This is a test file. It is fairly short. """, "This file can be encoded with latin1. \x83", """\ This is a test file. Here's a blank line: And here's some unicode: \xee \u0123 \uffe3 """, """\ This is a test file. Unicode characters: \xf3 \u2222 \u3333\u4444 \u5555 """, ] def test_reader(): for string in STRINGS: for encoding in ENCODINGS: try: # skip strings that can't be encoded with the current encoding string.encode(encoding) yield check_reader, string, encoding except UnicodeEncodeError: pass # nose shows the whole string arguments in a verbose mode; this is annoying, # so large string test is separated. LARGE_STRING = """\ This is a larger file. It has some lines that are longer \ than 72 characters. It's got lots of repetition. Here's \ some unicode chars: \xee \u0123 \uffe3 \ueeee \u2345 How fun! Let's repeat it twenty times. """*10 def test_reader_on_large_string(): for encoding in ENCODINGS: try: # skip strings that can't be encoded with the current encoding LARGE_STRING.encode(encoding) def _check(encoding, n=1000): check_reader(LARGE_STRING, encoding, n) yield _check, encoding except UnicodeEncodeError: pass def teardown_module(module=None): import gc gc.collect() nltk-3.1/nltk/test/unit/test_stem.py0000644000076500000240000000237512607224144017354 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import print_function, unicode_literals import unittest from nltk.stem.snowball import SnowballStemmer class SnowballTest(unittest.TestCase): def test_russian(self): # Russian words both consisting of Cyrillic # and Roman letters can be stemmed. 
        stemmer_russian = SnowballStemmer("russian")
        assert stemmer_russian.stem("авантненькая") == "авантненьк"
        assert stemmer_russian.stem("avenantnen'kai^a") == "avenantnen'k"

    def test_german(self):
        stemmer_german = SnowballStemmer("german")
        stemmer_german2 = SnowballStemmer("german", ignore_stopwords=True)
        assert stemmer_german.stem("Schr\xe4nke") == 'schrank'
        assert stemmer_german2.stem("Schr\xe4nke") == 'schrank'
        assert stemmer_german.stem("keinen") == 'kein'
        assert stemmer_german2.stem("keinen") == 'keinen'

    def test_spanish(self):
        stemmer = SnowballStemmer('spanish')
        assert stemmer.stem("Visionado") == 'vision'
        # The word 'algue' was raising an IndexError
        assert stemmer.stem("algue") == 'algu'

    def test_short_strings_bug(self):
        stemmer = SnowballStemmer('english')
        assert stemmer.stem("y's") == 'y'
nltk-3.1/nltk/test/unit/test_tag.py0000644000076500000240000000123712574600335017156 0ustar sbstaff00000000000000# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals


def test_basic():
    from nltk.tag import pos_tag
    from nltk.tokenize import word_tokenize

    result = pos_tag(word_tokenize("John's big idea isn't all that bad."))
    assert result == [('John', 'NNP'), ("'s", 'POS'), ('big', 'JJ'),
                      ('idea', 'NN'), ('is', 'VBZ'), ("n't", 'RB'),
                      ('all', 'DT'), ('that', 'DT'), ('bad', 'JJ'),
                      ('.', '.')]


def setup_module(module):
    from nose import SkipTest
    try:
        import numpy
    except ImportError:
        raise SkipTest("numpy is required for nltk.test.test_tag")
nltk-3.1/nltk/test/unit/test_tgrep.py0000644000076500000240000007412112607224144017523 0ustar sbstaff00000000000000#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Natural Language Toolkit: TGrep search
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Will Roberts
# URL:
# For license information, see LICENSE.TXT

'''
Unit tests for nltk.tgrep.
'''

from __future__ import absolute_import, print_function, unicode_literals

from nltk.compat import b
from nltk.tree import ParentedTree
from nltk import tgrep
import unittest


class TestSequenceFunctions(unittest.TestCase):
    '''
    Class containing unit tests for nltk.tgrep.
    '''

    def test_tokenize_simple(self):
        '''
        Simple test of tokenization.
        '''
        tokens = tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]')
        self.assertEqual(tokens,
                         ['A', '..', '(', 'B', '!', '<', 'C', '.', 'D', ')',
                          '|', '!', '[', '<<', '(', 'E', ',', 'F', ')',
                          '$', 'G', ']'])

    def test_tokenize_encoding(self):
        '''
        Test that tokenization handles bytes and strs the same way.
        '''
        self.assertEqual(
            tgrep.tgrep_tokenize(b('A .. (B !< C . D) | ![<< (E , F) $ G]')),
            tgrep.tgrep_tokenize('A .. (B !< C . D) | ![<< (E , F) $ G]'))

    def test_tokenize_link_types(self):
        '''
        Test tokenization of basic link types.
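        Each operator (e.g. '<3', '>-', '<<:') should come out as a single
        token, and a leading '!' should be split off as its own negation
        token.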
        '''
        self.assertEqual(tgrep.tgrep_tokenize('A<B'), ['A', '<', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A>B'), ['A', '>', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A<3B'), ['A', '<3', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A>3B'), ['A', '>3', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A<,B'), ['A', '<,', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A>,B'), ['A', '>,', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A<-3B'), ['A', '<-3', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A>-3B'), ['A', '>-3', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A<-B'), ['A', '<-', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A>-B'), ['A', '>-', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A<\'B'), ['A', '<\'', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A>\'B'), ['A', '>\'', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A<:B'), ['A', '<:', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A>:B'), ['A', '>:', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A<<B'), ['A', '<<', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A>>B'), ['A', '>>', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A<<,B'), ['A', '<<,', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A>>,B'), ['A', '>>,', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A<<\'B'), ['A', '<<\'', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A>>\'B'), ['A', '>>\'', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A<<:B'), ['A', '<<:', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A>>:B'), ['A', '>>:', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A.B'), ['A', '.', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A,B'), ['A', ',', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A..B'), ['A', '..', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A,,B'), ['A', ',,', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A$B'), ['A', '$', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A$.B'), ['A', '$.', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A$,B'), ['A', '$,', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A$..B'), ['A', '$..', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A$,,B'), ['A', '$,,', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!<B'), ['A', '!', '<', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!>B'), ['A', '!', '>', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!<3B'), ['A', '!', '<3', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!>3B'), ['A', '!', '>3', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!<,B'), ['A', '!', '<,', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!>,B'), ['A', '!', '>,', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!<-3B'), ['A', '!', '<-3', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!>-3B'), ['A', '!', '>-3', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!<-B'), ['A', '!', '<-', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!>-B'), ['A', '!', '>-', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!<\'B'), ['A', '!', '<\'', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!>\'B'), ['A', '!', '>\'', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!<:B'), ['A', '!', '<:', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!>:B'), ['A', '!', '>:', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!<<B'), ['A', '!', '<<', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!>>B'), ['A', '!', '>>', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!<<,B'), ['A', '!', '<<,', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!>>,B'), ['A', '!', '>>,', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!<<\'B'), ['A', '!', '<<\'', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!>>\'B'), ['A', '!', '>>\'', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!<<:B'), ['A', '!', '<<:', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!>>:B'), ['A', '!', '>>:', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!.B'), ['A', '!', '.', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!,B'), ['A', '!', ',', 'B'])
        self.assertEqual(tgrep.tgrep_tokenize('A!..B'), ['A', '!', '..',
'B']) self.assertEqual(tgrep.tgrep_tokenize('A!,,B'), ['A', '!', ',,', 'B']) self.assertEqual(tgrep.tgrep_tokenize('A!$B'), ['A', '!', '$', 'B']) self.assertEqual(tgrep.tgrep_tokenize('A!$.B'), ['A', '!', '$.', 'B']) self.assertEqual(tgrep.tgrep_tokenize('A!$,B'), ['A', '!', '$,', 'B']) self.assertEqual(tgrep.tgrep_tokenize('A!$..B'), ['A', '!', '$..', 'B']) self.assertEqual(tgrep.tgrep_tokenize('A!$,,B'), ['A', '!', '$,,', 'B']) def test_tokenize_examples(self): ''' Test tokenization of the TGrep2 manual example patterns. ''' self.assertEqual(tgrep.tgrep_tokenize('NP < PP'), ['NP', '<', 'PP']) self.assertEqual(tgrep.tgrep_tokenize('/^NP/'), ['/^NP/']) self.assertEqual(tgrep.tgrep_tokenize('NP << PP . VP'), ['NP', '<<', 'PP', '.', 'VP']) self.assertEqual(tgrep.tgrep_tokenize('NP << PP | . VP'), ['NP', '<<', 'PP', '|', '.', 'VP']) self.assertEqual(tgrep.tgrep_tokenize('NP !<< PP [> NP | >> VP]'), ['NP', '!', '<<', 'PP', '[', '>', 'NP', '|', '>>', 'VP', ']']) self.assertEqual(tgrep.tgrep_tokenize('NP << (PP . VP)'), ['NP', '<<', '(', 'PP', '.', 'VP', ')']) self.assertEqual(tgrep.tgrep_tokenize('NP <\' (PP <, (IN < on))'), ['NP', '<\'', '(', 'PP', '<,', '(', 'IN', '<', 'on', ')', ')']) self.assertEqual(tgrep.tgrep_tokenize('S < (A < B) < C'), ['S', '<', '(', 'A', '<', 'B', ')', '<', 'C']) self.assertEqual(tgrep.tgrep_tokenize('S < ((A < B) < C)'), ['S', '<', '(', '(', 'A', '<', 'B', ')', '<', 'C', ')']) self.assertEqual(tgrep.tgrep_tokenize('S < (A < B < C)'), ['S', '<', '(', 'A', '<', 'B', '<', 'C', ')']) self.assertEqual(tgrep.tgrep_tokenize('A3B"3B"', '<', 'C']) def test_tokenize_nodenames(self): ''' Test tokenization of node names. ''' self.assertEqual(tgrep.tgrep_tokenize('Robert'), ['Robert']) self.assertEqual(tgrep.tgrep_tokenize('/^[Bb]ob/'), ['/^[Bb]ob/']) self.assertEqual(tgrep.tgrep_tokenize('*'), ['*']) self.assertEqual(tgrep.tgrep_tokenize('__'), ['__']) # test tokenization of NLTK tree position syntax self.assertEqual(tgrep.tgrep_tokenize('N()'), ['N(', ')']) self.assertEqual(tgrep.tgrep_tokenize('N(0,)'), ['N(', '0', ',', ')']) self.assertEqual(tgrep.tgrep_tokenize('N(0,0)'), ['N(', '0', ',', '0', ')']) self.assertEqual(tgrep.tgrep_tokenize('N(0,0,)'), ['N(', '0', ',', '0', ',', ')']) def test_tokenize_macros(self): ''' Test tokenization of macro definitions. ''' self.assertEqual(tgrep.tgrep_tokenize( '@ NP /^NP/;\n@ NN /^NN/;\n@NP [!< NP | < @NN] !$.. @NN'), ['@', 'NP', '/^NP/', ';', '@', 'NN', '/^NN/', ';', '@NP', '[', '!', '<', 'NP', '|', '<', '@NN', ']', '!', '$..', '@NN']) def test_node_simple(self): ''' Test a simple use of tgrep for finding nodes matching a given pattern. ''' tree = ParentedTree.fromstring( '(S (NP (DT the) (JJ big) (NN dog)) ' '(VP bit) (NP (DT a) (NN cat)))') self.assertEqual(list(tgrep.tgrep_positions('NN', [tree])), [[(0,2), (2,1)]]) self.assertEqual(list(tgrep.tgrep_nodes('NN', [tree])), [[tree[0,2], tree[2,1]]]) self.assertEqual(list(tgrep.tgrep_positions('NN|JJ', [tree])), [[(0, 1), (0, 2), (2, 1)]]) def test_node_printing(self): '''Test that the tgrep print operator ' is properly ignored.''' tree = ParentedTree.fromstring('(S (n x) (N x))') self.assertEqual(list(tgrep.tgrep_positions('N', [tree])), list(tgrep.tgrep_positions('\'N', [tree]))) self.assertEqual(list(tgrep.tgrep_positions('/[Nn]/', [tree])), list(tgrep.tgrep_positions('\'/[Nn]/', [tree]))) def test_node_encoding(self): ''' Test that tgrep search strings handles bytes and strs the same way. 
''' tree = ParentedTree.fromstring( '(S (NP (DT the) (JJ big) (NN dog)) ' '(VP bit) (NP (DT a) (NN cat)))') self.assertEqual(list(tgrep.tgrep_positions(b('NN'), [tree])), list(tgrep.tgrep_positions('NN', [tree]))) self.assertEqual(list(tgrep.tgrep_nodes(b('NN'), [tree])), list(tgrep.tgrep_nodes('NN', [tree]))) self.assertEqual(list(tgrep.tgrep_positions(b('NN|JJ'), [tree])), list(tgrep.tgrep_positions('NN|JJ', [tree]))) def test_node_nocase(self): ''' Test selecting nodes using case insensitive node names. ''' tree = ParentedTree.fromstring('(S (n x) (N x))') self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions('i@"N"', [tree])), [[(0,), (1,)]]) def test_node_quoted(self): ''' Test selecting nodes using quoted node names. ''' tree = ParentedTree.fromstring('(N ("N" x) (N" x) ("\\" x))') self.assertEqual(list(tgrep.tgrep_positions('"N"', [tree])), [[()]]) self.assertEqual(list(tgrep.tgrep_positions('"\\"N\\""', [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions('"N\\""', [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions('"\\"\\\\\\""', [tree])), [[(2,)]]) def test_node_regex(self): ''' Test regex matching on nodes. ''' tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))') # This is a regular expression that matches any node whose # name starts with NP, including NP-SBJ: self.assertEqual(list(tgrep.tgrep_positions('/^NP/', [tree])), [[(0,), (1,)]]) def test_node_regex_2(self): ''' Test regex matching on nodes. ''' tree = ParentedTree.fromstring('(S (SBJ x) (SBJ1 x) (NP-SBJ x))') self.assertEqual(list(tgrep.tgrep_positions('/^SBJ/', [tree])), [[(0,), (1,)]]) # This is a regular expression that matches any node whose # name includes SBJ, including NP-SBJ: self.assertEqual(list(tgrep.tgrep_positions('/SBJ/', [tree])), [[(0,), (1,), (2,)]]) def test_node_tree_position(self): ''' Test matching on nodes based on NLTK tree position. ''' tree = ParentedTree.fromstring('(S (NP-SBJ x) (NP x) (NNP x) (VP x))') # test all tree positions that are not leaves leaf_positions = set([tree.leaf_treeposition(x) for x in range(len(tree.leaves()))]) tree_positions = [x for x in tree.treepositions() if x not in leaf_positions] for position in tree_positions: node_id = 'N{0}'.format(position) tgrep_positions = list(tgrep.tgrep_positions(node_id, [tree])) self.assertEqual(len(tgrep_positions[0]), 1) self.assertEqual(tgrep_positions[0][0], position) def test_node_noleaves(self): ''' Test node name matching with the search_leaves flag set to False. ''' tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))') self.assertEqual(list(tgrep.tgrep_positions('x', [tree])), [[(0, 0, 0), (1, 0, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('x', [tree], False)), [[]]) def tests_rel_dominance(self): ''' Test matching nodes based on dominance relations. 
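        In tgrep2 syntax, 'A < B' matches an A whose immediate child is a B,
        'A > B' matches an A whose immediate parent is a B, and the doubled
        forms '<<' and '>>' allow dominance at any depth.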
''' tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))') self.assertEqual(list(tgrep.tgrep_positions('* < T', [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions('* < T > S', [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions('* !< T', [tree])), [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* !< T > S', [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions('* > A', [tree])), [[(0, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* > B', [tree])), [[(1, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* !> B', [tree])), [[(), (0,), (0, 0), (0, 0, 0), (1,), (1, 0, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* !> B >> S', [tree])), [[(0,), (0, 0), (1,)]]) self.assertEqual(list(tgrep.tgrep_positions('* >> S', [tree])), [[(0,), (0, 0), (1,), (1, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* >>, S', [tree])), [[(0,), (0, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* >>\' S', [tree])), [[(1,), (1, 0)]]) # Known issue: #self.assertEqual(list(tgrep.tgrep_positions('* !>> S', [tree])), # [[()]]) self.assertEqual(list(tgrep.tgrep_positions('* << T', [tree])), [[(), (0,)]]) self.assertEqual(list(tgrep.tgrep_positions('* <<\' T', [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions('* <<1 N', [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions('* !<< T', [tree])), [[(0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0)]]) tree = ParentedTree.fromstring('(S (A (T x)) (B (T x) (N x )))') self.assertEqual(list(tgrep.tgrep_positions('* <: T', [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions('* < T', [tree])), [[(0,), (1,)]]) self.assertEqual(list(tgrep.tgrep_positions('* !<: T', [tree])), [[(), (0, 0), (0, 0, 0), (1,), (1, 0), (1, 0, 0), (1, 1), (1, 1, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* !<: T > S', [tree])), [[(1,)]]) tree = ParentedTree.fromstring('(S (T (A x) (B x)) (T (C x)))') self.assertEqual(list(tgrep.tgrep_positions('* >: T', [tree])), [[(1, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* !>: T', [tree])), [[(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0, 0)]]) tree = ParentedTree.fromstring('(S (A (B (C (D (E (T x))))))' ' (A (B (C (D (E (T x))) (N x)))))') self.assertEqual(list(tgrep.tgrep_positions('* <<: T', [tree])), [[(0,), (0, 0), (0, 0, 0), (0, 0, 0, 0), (0, 0, 0, 0, 0), (1, 0, 0, 0), (1, 0, 0, 0, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* >>: A', [tree])), [[(0, 0), (0, 0, 0), (0, 0, 0, 0), (0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0), (1, 0), (1, 0, 0)]]) def test_bad_operator(self): ''' Test error handling of undefined tgrep operators. ''' tree = ParentedTree.fromstring('(S (A (T x)) (B (N x)))') self.assertRaises( tgrep.TgrepException, list, tgrep.tgrep_positions('* >>> S', [tree])) def test_comments(self): ''' Test that comments are correctly filtered out of tgrep search strings. ''' tree = ParentedTree.fromstring('(S (NN x) (NP x) (NN x))') search1 = ''' @ NP /^NP/; @ NN /^NN/; @NN ''' self.assertEqual(list(tgrep.tgrep_positions(search1, [tree])), [[(0,), (2,)]]) search2 = ''' # macros @ NP /^NP/; @ NN /^NN/; # search string @NN ''' self.assertEqual(list(tgrep.tgrep_positions(search2, [tree])), [[(0,), (2,)]]) def test_rel_sister_nodes(self): ''' Test matching sister nodes in a tree. ''' tree = ParentedTree.fromstring('(S (A x) (B x) (C x))') self.assertEqual(list(tgrep.tgrep_positions('* $. B', [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions('* $.. 
B', [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions('* $, B', [tree])), [[(2,)]]) self.assertEqual(list(tgrep.tgrep_positions('* $,, B', [tree])), [[(2,)]]) self.assertEqual(list(tgrep.tgrep_positions('* $ B', [tree])), [[(0,), (2,)]]) def tests_rel_indexed_children(self): ''' Test matching nodes based on their index in their parent node. ''' tree = ParentedTree.fromstring('(S (A x) (B x) (C x))') self.assertEqual(list(tgrep.tgrep_positions('* >, S', [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions('* >1 S', [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions('* >2 S', [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions('* >3 S', [tree])), [[(2,)]]) self.assertEqual(list(tgrep.tgrep_positions('* >\' S', [tree])), [[(2,)]]) self.assertEqual(list(tgrep.tgrep_positions('* >-1 S', [tree])), [[(2,)]]) self.assertEqual(list(tgrep.tgrep_positions('* >-2 S', [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions('* >-3 S', [tree])), [[(0,)]]) tree = ParentedTree.fromstring( '(S (D (A x) (B x) (C x)) (E (B x) (C x) (A x)) ' '(F (C x) (A x) (B x)))') self.assertEqual(list(tgrep.tgrep_positions('* <, A', [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions('* <1 A', [tree])), [[(0,)]]) self.assertEqual(list(tgrep.tgrep_positions('* <2 A', [tree])), [[(2,)]]) self.assertEqual(list(tgrep.tgrep_positions('* <3 A', [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions('* <\' A', [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions('* <-1 A', [tree])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions('* <-2 A', [tree])), [[(2,)]]) self.assertEqual(list(tgrep.tgrep_positions('* <-3 A', [tree])), [[(0,)]]) def test_rel_precedence(self): ''' Test matching nodes based on precedence relations. ''' tree = ParentedTree.fromstring('(S (NP (NP (PP x)) (NP (AP x)))' ' (VP (AP (X (PP x)) (Y (AP x))))' ' (NP (RC (NP (AP x)))))') self.assertEqual(list(tgrep.tgrep_positions('* . X', [tree])), [[(0,), (0, 1), (0, 1, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* . Y', [tree])), [[(1, 0, 0), (1, 0, 0, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* .. X', [tree])), [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* .. Y', [tree])), [[(0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1, 0, 0), (1, 0, 0, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* , X', [tree])), [[(1, 0, 1), (1, 0, 1, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* , Y', [tree])), [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* ,, X', [tree])), [[(1, 0, 1), (1, 0, 1, 0), (2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]]) self.assertEqual(list(tgrep.tgrep_positions('* ,, Y', [tree])), [[(2,), (2, 0), (2, 0, 0), (2, 0, 0, 0)]]) def test_examples(self): ''' Test the Basic Examples from the TGrep2 manual. ''' tree = ParentedTree.fromstring('(S (NP (AP x)) (NP (PP x)))') # This matches any NP node that immediately dominates a PP: self.assertEqual(list(tgrep.tgrep_positions('NP < PP', [tree])), [[(1,)]]) tree = ParentedTree.fromstring('(S (NP x) (VP x) (NP (PP x)) (VP x))') # This matches an NP that dominates a PP and is immediately # followed by a VP: self.assertEqual(list(tgrep.tgrep_positions('NP << PP . VP', [tree])), [[(2,)]]) tree = ParentedTree.fromstring('(S (NP (AP x)) (NP (PP x)) ' '(NP (DET x) (NN x)) (VP x))') # This matches an NP that dominates a PP or is immediately # followed by a VP: self.assertEqual(list(tgrep.tgrep_positions('NP << PP | . 
VP', [tree])), [[(1,), (2,)]]) tree = ParentedTree.fromstring('(S (NP (NP (PP x)) (NP (AP x)))' ' (VP (AP (NP (PP x)) (NP (AP x))))' ' (NP (RC (NP (AP x)))))') # This matches an NP that does not dominate a PP. Also, the NP # must either have a parent that is an NP or be dominated by a # VP: self.assertEqual(list(tgrep.tgrep_positions( 'NP !<< PP [> NP | >> VP]', [tree])), [[(0, 1), (1, 0, 1)]]) tree = ParentedTree.fromstring('(S (NP (AP (PP x) (VP x))) ' '(NP (AP (PP x) (NP x))) (NP x))') # This matches an NP that dominates a PP which itself is # immediately followed by a VP. Note the use of parentheses to # group ". VP" with the PP rather than with the NP: self.assertEqual(list(tgrep.tgrep_positions('NP << (PP . VP)', [tree])), [[(0,)]]) tree = ParentedTree.fromstring( '(S (NP (DET a) (NN cat) (PP (IN on) (NP x)))' ' (NP (DET a) (NN cat) (PP (IN on) (NP x)) (PP x))' ' (NP x))') # This matches an NP whose last child is a PP that begins with # the preposition "on": self.assertEqual(list(tgrep.tgrep_positions( 'NP <\' (PP <, (IN < on))', [tree])), [[(0,)]]) tree = ParentedTree.fromstring( '(S (S (C x) (A (B x))) (S (C x) (A x)) ' '(S (D x) (A (B x))))') # The following pattern matches an S which has a child A and # another child that is a C and that the A has a child B: self.assertEqual(list(tgrep.tgrep_positions('S < (A < B) < C', [tree])), [[(0,)]]) tree = ParentedTree.fromstring( '(S (S (A (B x) (C x))) (S (S (C x) (A (B x)))))') # However, this pattern means that S has child A and that A # has children B and C: self.assertEqual(list(tgrep.tgrep_positions('S < ((A < B) < C)', [tree])), [[(0,)]]) # It is equivalent to this: self.assertEqual(list(tgrep.tgrep_positions('S < (A < B < C)', [tree])), [[(0,)]]) def test_use_macros(self): ''' Test defining and using tgrep2 macros. ''' tree = ParentedTree.fromstring( '(VP (VB sold) (NP (DET the) ' '(NN heiress)) (NP (NN deed) (PREP to) ' '(NP (DET the) (NN school) (NN house))))') self.assertEqual(list(tgrep.tgrep_positions( '@ NP /^NP/;\n@ NN /^NN/;\n@NP !< @NP !$.. @NN', [tree])), [[(1,), (2, 2)]]) # use undefined macro @CNP self.assertRaises( tgrep.TgrepException, list, tgrep.tgrep_positions( '@ NP /^NP/;\n@ NN /^NN/;\n@CNP !< @NP !$.. @NN', [tree])) def test_tokenize_node_labels(self): '''Test tokenization of labeled nodes.''' self.assertEqual(tgrep.tgrep_tokenize( 'S < @SBJ < (@VP < (@VB $.. @OBJ))'), ['S', '<', '@SBJ', '<', '(', '@VP', '<', '(', '@VB', '$..', '@OBJ', ')', ')']) self.assertEqual(tgrep.tgrep_tokenize( 'S < @SBJ=s < (@VP=v < (@VB $.. @OBJ))'), ['S', '<', '@SBJ', '=', 's', '<', '(', '@VP', '=', 'v', '<', '(', '@VB', '$..', '@OBJ', ')', ')']) def test_tokenize_segmented_patterns(self): '''Test tokenization of segmented patterns.''' self.assertEqual(tgrep.tgrep_tokenize( 'S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v'), ['S', '<', '@SBJ', '=', 's', '<', '(', '@VP', '=', 'v', '<', '(', '@VB', '$..', '@OBJ', ')', ')', ':', '=s', '..', '=v']) def test_labeled_nodes(self): ''' Test labeled nodes. Test case from Emily M. Bender. ''' search = ''' # macros @ SBJ /SBJ/; @ VP /VP/; @ VB /VB/; @ VPoB /V[PB]/; @ OBJ /OBJ/; # 1 svo S < @SBJ=s < (@VP=v < (@VB $.. @OBJ)) : =s .. =v''' sent1 = ParentedTree.fromstring( '(S (NP-SBJ I) (VP (VB eat) (NP-OBJ (NNS apples))))') sent2 = ParentedTree.fromstring( '(S (VP (VB eat) (NP-OBJ (NNS apples))) (NP-SBJ I))') search_firsthalf = (search.split('\n\n')[0] + 'S < @SBJ < (@VP < (@VB $.. @OBJ))') search_rewrite = 'S < (/.*SBJ/ $.. (/VP/ < (/VB/ $.. 
/.*OBJ/)))' self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent1]))[0]) self.assertTrue(list(tgrep.tgrep_positions(search, [sent1]))[0]) self.assertTrue(list(tgrep.tgrep_positions(search_rewrite, [sent1]))[0]) self.assertEqual(list(tgrep.tgrep_positions(search, [sent1])), list(tgrep.tgrep_positions(search_rewrite, [sent1]))) self.assertTrue(list(tgrep.tgrep_positions(search_firsthalf, [sent2]))[0]) self.assertFalse(list(tgrep.tgrep_positions(search, [sent2]))[0]) self.assertFalse(list(tgrep.tgrep_positions(search_rewrite, [sent2]))[0]) self.assertEqual(list(tgrep.tgrep_positions(search, [sent2])), list(tgrep.tgrep_positions(search_rewrite, [sent2]))) def test_multiple_conjs(self): ''' Test that multiple (3 or more) conjunctions of node relations are handled properly. ''' sent = ParentedTree.fromstring( '((A (B b) (C c)) (A (B b) (C c) (D d)))') # search = '(A < B < C < D)' # search_tworels = '(A < B < C)' self.assertEqual(list(tgrep.tgrep_positions('(A < B < C < D)', [sent])), [[(1,)]]) self.assertEqual(list(tgrep.tgrep_positions('(A < B < C)', [sent])), [[(0,), (1,)]]) def test_trailing_semicolon(self): ''' Test that semicolons at the end of a tgrep2 search string won't cause a parse failure. ''' tree = ParentedTree.fromstring( '(S (NP (DT the) (JJ big) (NN dog)) ' '(VP bit) (NP (DT a) (NN cat)))') self.assertEqual(list(tgrep.tgrep_positions('NN', [tree])), [[(0,2), (2,1)]]) self.assertEqual(list(tgrep.tgrep_positions('NN;', [tree])), [[(0,2), (2,1)]]) self.assertEqual(list(tgrep.tgrep_positions('NN;;', [tree])), [[(0,2), (2,1)]]) if __name__ == '__main__': unittest.main() nltk-3.1/nltk/test/unit/test_twitter_auth.py0000644000076500000240000001261512607224144021125 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- """ Tests for static parts of Twitter package """ import os import unittest from nltk.twitter import Authenticate class TestCredentials(unittest.TestCase): """ Tests that Twitter credentials information from file is handled correctly. """ def setUp(self): self.subdir = os.path.join(os.path.dirname(__file__), 'files') self.auth = Authenticate() os.environ['TWITTER'] = 'twitter-files' def test_environment(self): """ Test that environment variable has been read correctly. """ fn = os.path.basename(self.auth.creds_subdir) self.assertEqual(fn, os.environ['TWITTER']) def test_empty_subdir1(self): """ Setting subdir to empty path should raise an error. """ try: self.auth.load_creds(subdir='') # raises ValueError (zero length field name in format) for python 2.6 # OSError for the rest except OSError: pass except ValueError: pass except Exception as e: self.fail('Unexpected exception thrown: %s' % e) else: self.fail('OSError exception not thrown.') def test_empty_subdir2(self): """ Setting subdir to `None` should raise an error. """ self.auth.creds_subdir = None try: self.auth.load_creds() except ValueError: pass except Exception as e: self.fail('Unexpected exception thrown: %s' % e) else: self.fail('ValueError exception not thrown.') def test_missingdir(self): """ Setting subdir to nonexistent directory should raise an error. 
""" try: self.auth.load_creds(subdir='/nosuchdir') # raises ValueError (zero length field name in format) for python 2.6 # OSError for the rest except OSError: pass except ValueError: pass except Exception as e: self.fail('Unexpected exception thrown: %s' % e) else: self.fail('OSError exception not thrown.') def test_missingfile1(self): """ Defaults for authentication will fail since 'credentials.txt' not present in default subdir, as read from `os.environ['TWITTER']`. """ try: self.auth.load_creds() # raises ValueError (zero length field name in format) for python 2.6 # OSError for the rest except OSError: pass except ValueError: pass except Exception as e: self.fail('Unexpected exception thrown: %s' % e) else: self.fail('OSError exception not thrown.') def test_missingfile2(self): """ Credentials file 'foobar' cannot be found in default subdir. """ try: self.auth.load_creds(creds_file='foobar') # raises ValueError (zero length field name in format) for python 2.6 # OSError for the rest except OSError: pass except ValueError: pass except Exception as e: self.fail('Unexpected exception thrown: %s' % e) else: self.fail('OSError exception not thrown.') def test_incomplete_file(self): """ Credentials file 'bad_oauth1-1.txt' is incomplete """ try: self.auth.load_creds(creds_file='bad_oauth1-1.txt', subdir=self.subdir) except ValueError: pass except Exception as e: self.fail('Unexpected exception thrown: %s' % e) else: self.fail('ValueError exception not thrown.') def test_malformed_file1(self): """ First key in credentials file 'bad_oauth1-2.txt' is ill-formed """ try: self.auth.load_creds(creds_file='bad_oauth1-2.txt', subdir=self.subdir) except ValueError: pass except Exception as e: self.fail('Unexpected exception thrown: %s' % e) else: self.fail('ValueError exception not thrown.') def test_malformed_file2(self): """ First key in credentials file 'bad_oauth1-2.txt' is ill-formed """ try: self.auth.load_creds(creds_file='bad_oauth1-3.txt', subdir=self.subdir) except ValueError: pass except Exception as e: self.fail('Unexpected exception thrown: %s' % e) else: self.fail('ValueError exception not thrown.') def test_correct_path(self): """ Path to default credentials file is well-formed, given specified subdir. """ self.auth.load_creds(subdir=self.subdir) self.auth.creds_fullpath = os.path.join(self.subdir, self.auth.creds_file) def test_correct_file1(self): """ Default credentials file is identified """ self.auth.load_creds(subdir=self.subdir) self.assertEqual(self.auth.creds_file, 'credentials.txt') def test_correct_file2(self): """ Default credentials file has been read correctluy """ oauth = self.auth.load_creds(subdir=self.subdir) self.assertEqual(oauth['app_key'], 'a') if __name__ == '__main__': unittest.main() nltk-3.1/nltk/test/unit/translate/0000755000076500000240000000000012610001541016745 5ustar sbstaff00000000000000nltk-3.1/nltk/test/unit/translate/__init__.py0000644000076500000240000000000012607224144021060 0ustar sbstaff00000000000000nltk-3.1/nltk/test/unit/translate/test_bleu.py0000644000076500000240000000641612610000304021310 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- """ Tests for BLEU translation evaluation metric """ import unittest from nltk.translate.bleu_score import _modified_precision class TestBLEU(unittest.TestCase): def test__modified_precision(self): """ Examples from the original BLEU paper http://www.aclweb.org/anthology/P02-1040.pdf """ # Example 1: the "the*" example. # Reference sentences. 
ref1 = 'the cat is on the mat'.split() ref2 = 'there is a cat on the mat'.split() # Hypothesis sentence(s). hyp1 = 'the the the the the the the'.split() references = [ref1, ref2] # Testing modified unigram precision. hyp1_unigram_precision = _modified_precision(references, hyp1, n=1) assert (round(hyp1_unigram_precision, 4) == 0.2857) # Testing modified bigram precision. assert(_modified_precision(references, hyp1, n=2) == 0.0) # Example 2: the "of the" example. # Reference sentences ref1 = str('It is a guide to action that ensures that the military ' 'will forever heed Party commands').split() ref2 = str('It is the guiding principle which guarantees the military ' 'forces always being under the command of the Party').split() ref3 = str('It is the practical guide for the army always to heed ' 'the directions of the party').split() # Hypothesis sentence(s). hyp1 = 'of the'.split() references = [ref1, ref2, ref3] # Testing modified unigram precision. assert (_modified_precision(references, hyp1, n=1) == 1.0) # Testing modified bigram precision. assert(_modified_precision(references, hyp1, n=2) == 1.0) # Example 3: Proper MT outputs. hyp1 = str('It is a guide to action which ensures that the military ' 'always obeys the commands of the party').split() hyp2 = str('It is to insure the troops forever hearing the activity ' 'guidebook that party direct').split() references = [ref1, ref2, ref3] # Unigram precision. hyp1_unigram_precision = _modified_precision(references, hyp1, n=1) hyp2_unigram_precision = _modified_precision(references, hyp2, n=1) # Test unigram precision without rounding. assert (hyp1_unigram_precision == 0.9444444444444444) assert (hyp2_unigram_precision == 0.5714285714285714) # Test unigram precision with rounding. assert (round(hyp1_unigram_precision, 4) == 0.9444) assert (round(hyp2_unigram_precision, 4) == 0.5714) # Bigram precision hyp1_bigram_precision = _modified_precision(references, hyp1, n=2) hyp2_bigram_precision = _modified_precision(references, hyp2, n=2) # Test bigram precision without rounding. assert (hyp1_bigram_precision == 0.5882352941176471) assert (hyp2_bigram_precision == 0.07692307692307693) # Test bigram precision with rounding. 
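        # The exact values above and below are clipped-match / total-n-gram
        # ratios: hyp1 matches 17/18 unigrams and 10/17 bigrams, while hyp2
        # matches 8/14 unigrams and 1/13 bigrams; rounded to four places
        # these give 0.9444, 0.5882, 0.5714 and 0.0769.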
assert (round(hyp1_bigram_precision, 4) == 0.5882) assert (round(hyp2_bigram_precision, 4) == 0.0769) def test_brevity_penalty(self): pass nltk-3.1/nltk/test/unit/translate/test_ibm1.py0000644000076500000240000000530312607224144021223 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- """ Tests for IBM Model 1 training methods """ import unittest from collections import defaultdict from nltk.translate import AlignedSent from nltk.translate import IBMModel from nltk.translate import IBMModel1 from nltk.translate.ibm_model import AlignmentInfo class TestIBMModel1(unittest.TestCase): def test_set_uniform_translation_probabilities(self): # arrange corpus = [ AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']), AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']), ] model1 = IBMModel1(corpus, 0) # act model1.set_uniform_probabilities(corpus) # assert # expected_prob = 1.0 / (target vocab size + 1) self.assertEqual(model1.translation_table['ham']['eier'], 1.0 / 3) self.assertEqual(model1.translation_table['eggs'][None], 1.0 / 3) def test_set_uniform_translation_probabilities_of_non_domain_values(self): # arrange corpus = [ AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']), AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']), ] model1 = IBMModel1(corpus, 0) # act model1.set_uniform_probabilities(corpus) # assert # examine target words that are not in the training data domain self.assertEqual(model1.translation_table['parrot']['eier'], IBMModel.MIN_PROB) def test_prob_t_a_given_s(self): # arrange src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken'] trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham'] corpus = [AlignedSent(trg_sentence, src_sentence)] alignment_info = AlignmentInfo((0, 1, 4, 0, 2, 5, 5), [None] + src_sentence, ['UNUSED'] + trg_sentence, None) translation_table = defaultdict(lambda: defaultdict(float)) translation_table['i']['ich'] = 0.98 translation_table['love']['gern'] = 0.98 translation_table['to'][None] = 0.98 translation_table['eat']['esse'] = 0.98 translation_table['smoked']['räucherschinken'] = 0.98 translation_table['ham']['räucherschinken'] = 0.98 model1 = IBMModel1(corpus, 0) model1.translation_table = translation_table # act probability = model1.prob_t_a_given_s(alignment_info) # assert lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 expected_probability = lexical_translation self.assertEqual(round(probability, 4), round(expected_probability, 4)) nltk-3.1/nltk/test/unit/translate/test_ibm2.py0000644000076500000240000000655112607224144021232 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- """ Tests for IBM Model 2 training methods """ import unittest from collections import defaultdict from nltk.translate import AlignedSent from nltk.translate import IBMModel from nltk.translate import IBMModel2 from nltk.translate.ibm_model import AlignmentInfo class TestIBMModel2(unittest.TestCase): def test_set_uniform_alignment_probabilities(self): # arrange corpus = [ AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']), AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']), ] model2 = IBMModel2(corpus, 0) # act model2.set_uniform_probabilities(corpus) # assert # expected_prob = 1.0 / (length of source sentence + 1) self.assertEqual(model2.alignment_table[0][1][3][2], 1.0 / 4) self.assertEqual(model2.alignment_table[2][4][2][4], 1.0 / 3) def test_set_uniform_alignment_probabilities_of_non_domain_values(self): # arrange corpus = [ AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 
'eier']), AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']), ] model2 = IBMModel2(corpus, 0) # act model2.set_uniform_probabilities(corpus) # assert # examine i and j values that are not in the training data domain self.assertEqual(model2.alignment_table[99][1][3][2], IBMModel.MIN_PROB) self.assertEqual(model2.alignment_table[2][99][2][4], IBMModel.MIN_PROB) def test_prob_t_a_given_s(self): # arrange src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken'] trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham'] corpus = [AlignedSent(trg_sentence, src_sentence)] alignment_info = AlignmentInfo((0, 1, 4, 0, 2, 5, 5), [None] + src_sentence, ['UNUSED'] + trg_sentence, None) translation_table = defaultdict(lambda: defaultdict(float)) translation_table['i']['ich'] = 0.98 translation_table['love']['gern'] = 0.98 translation_table['to'][None] = 0.98 translation_table['eat']['esse'] = 0.98 translation_table['smoked']['räucherschinken'] = 0.98 translation_table['ham']['räucherschinken'] = 0.98 alignment_table = defaultdict( lambda: defaultdict(lambda: defaultdict( lambda: defaultdict(float)))) alignment_table[0][3][5][6] = 0.97 # None -> to alignment_table[1][1][5][6] = 0.97 # ich -> i alignment_table[2][4][5][6] = 0.97 # esse -> eat alignment_table[4][2][5][6] = 0.97 # gern -> love alignment_table[5][5][5][6] = 0.96 # räucherschinken -> smoked alignment_table[5][6][5][6] = 0.96 # räucherschinken -> ham model2 = IBMModel2(corpus, 0) model2.translation_table = translation_table model2.alignment_table = alignment_table # act probability = model2.prob_t_a_given_s(alignment_info) # assert lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 alignment = 0.97 * 0.97 * 0.97 * 0.97 * 0.96 * 0.96 expected_probability = lexical_translation * alignment self.assertEqual(round(probability, 4), round(expected_probability, 4)) nltk-3.1/nltk/test/unit/translate/test_ibm3.py0000644000076500000240000001020112607224144021216 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- """ Tests for IBM Model 3 training methods """ import unittest from collections import defaultdict from nltk.translate import AlignedSent from nltk.translate import IBMModel from nltk.translate import IBMModel3 from nltk.translate.ibm_model import AlignmentInfo class TestIBMModel3(unittest.TestCase): def test_set_uniform_distortion_probabilities(self): # arrange corpus = [ AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']), AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']), ] model3 = IBMModel3(corpus, 0) # act model3.set_uniform_probabilities(corpus) # assert # expected_prob = 1.0 / length of target sentence self.assertEqual(model3.distortion_table[1][0][3][2], 1.0 / 2) self.assertEqual(model3.distortion_table[4][2][2][4], 1.0 / 4) def test_set_uniform_distortion_probabilities_of_non_domain_values(self): # arrange corpus = [ AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']), AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']), ] model3 = IBMModel3(corpus, 0) # act model3.set_uniform_probabilities(corpus) # assert # examine i and j values that are not in the training data domain self.assertEqual(model3.distortion_table[0][0][3][2], IBMModel.MIN_PROB) self.assertEqual(model3.distortion_table[9][2][2][4], IBMModel.MIN_PROB) self.assertEqual(model3.distortion_table[2][9][2][4], IBMModel.MIN_PROB) def test_prob_t_a_given_s(self): # arrange src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken'] trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham'] corpus = 
[AlignedSent(trg_sentence, src_sentence)] alignment_info = AlignmentInfo((0, 1, 4, 0, 2, 5, 5), [None] + src_sentence, ['UNUSED'] + trg_sentence, [[3], [1], [4], [], [2], [5, 6]]) distortion_table = defaultdict( lambda: defaultdict(lambda: defaultdict( lambda: defaultdict(float)))) distortion_table[1][1][5][6] = 0.97 # i -> ich distortion_table[2][4][5][6] = 0.97 # love -> gern distortion_table[3][0][5][6] = 0.97 # to -> NULL distortion_table[4][2][5][6] = 0.97 # eat -> esse distortion_table[5][5][5][6] = 0.97 # smoked -> räucherschinken distortion_table[6][5][5][6] = 0.97 # ham -> räucherschinken translation_table = defaultdict(lambda: defaultdict(float)) translation_table['i']['ich'] = 0.98 translation_table['love']['gern'] = 0.98 translation_table['to'][None] = 0.98 translation_table['eat']['esse'] = 0.98 translation_table['smoked']['räucherschinken'] = 0.98 translation_table['ham']['räucherschinken'] = 0.98 fertility_table = defaultdict(lambda: defaultdict(float)) fertility_table[1]['ich'] = 0.99 fertility_table[1]['esse'] = 0.99 fertility_table[0]['ja'] = 0.99 fertility_table[1]['gern'] = 0.99 fertility_table[2]['räucherschinken'] = 0.999 fertility_table[1][None] = 0.99 probabilities = { 'p1': 0.167, 'translation_table': translation_table, 'distortion_table': distortion_table, 'fertility_table': fertility_table, 'alignment_table': None } model3 = IBMModel3(corpus, 0, probabilities) # act probability = model3.prob_t_a_given_s(alignment_info) # assert null_generation = 5 * pow(0.167, 1) * pow(0.833, 4) fertility = 1*0.99 * 1*0.99 * 1*0.99 * 1*0.99 * 2*0.999 lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 distortion = 0.97 * 0.97 * 0.97 * 0.97 * 0.97 * 0.97 expected_probability = (null_generation * fertility * lexical_translation * distortion) self.assertEqual(round(probability, 4), round(expected_probability, 4)) nltk-3.1/nltk/test/unit/translate/test_ibm4.py0000644000076500000240000001241212607224144021225 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- """ Tests for IBM Model 4 training methods """ import unittest from collections import defaultdict from nltk.translate import AlignedSent from nltk.translate import IBMModel from nltk.translate import IBMModel4 from nltk.translate.ibm_model import AlignmentInfo class TestIBMModel4(unittest.TestCase): def test_set_uniform_distortion_probabilities_of_max_displacements(self): # arrange src_classes = {'schinken': 0, 'eier': 0, 'spam': 1} trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2} corpus = [ AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']), AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']), ] model4 = IBMModel4(corpus, 0, src_classes, trg_classes) # act model4.set_uniform_probabilities(corpus) # assert # number of displacement values = # 2 *(number of words in longest target sentence - 1) expected_prob = 1.0 / (2 * (4 - 1)) # examine the boundary values for (displacement, src_class, trg_class) self.assertEqual(model4.head_distortion_table[3][0][0], expected_prob) self.assertEqual(model4.head_distortion_table[-3][1][2], expected_prob) self.assertEqual(model4.non_head_distortion_table[3][0], expected_prob) self.assertEqual(model4.non_head_distortion_table[-3][2], expected_prob) def test_set_uniform_distortion_probabilities_of_non_domain_values(self): # arrange src_classes = {'schinken': 0, 'eier': 0, 'spam': 1} trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2} corpus = [ AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']), AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']), 
] model4 = IBMModel4(corpus, 0, src_classes, trg_classes) # act model4.set_uniform_probabilities(corpus) # assert # examine displacement values that are not in the training data domain self.assertEqual(model4.head_distortion_table[4][0][0], IBMModel.MIN_PROB) self.assertEqual(model4.head_distortion_table[100][1][2], IBMModel.MIN_PROB) self.assertEqual(model4.non_head_distortion_table[4][0], IBMModel.MIN_PROB) self.assertEqual(model4.non_head_distortion_table[100][2], IBMModel.MIN_PROB) def test_prob_t_a_given_s(self): # arrange src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken'] trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham'] src_classes = {'räucherschinken': 0, 'ja': 1, 'ich': 2, 'esse': 3, 'gern': 4} trg_classes = {'ham': 0, 'smoked': 1, 'i': 3, 'love': 4, 'to': 2, 'eat': 4} corpus = [AlignedSent(trg_sentence, src_sentence)] alignment_info = AlignmentInfo((0, 1, 4, 0, 2, 5, 5), [None] + src_sentence, ['UNUSED'] + trg_sentence, [[3], [1], [4], [], [2], [5, 6]]) head_distortion_table = defaultdict( lambda: defaultdict(lambda: defaultdict(float))) head_distortion_table[1][None][3] = 0.97 # None, i head_distortion_table[3][2][4] = 0.97 # ich, eat head_distortion_table[-2][3][4] = 0.97 # esse, love head_distortion_table[3][4][1] = 0.97 # gern, smoked non_head_distortion_table = defaultdict(lambda: defaultdict(float)) non_head_distortion_table[1][0] = 0.96 # ham translation_table = defaultdict(lambda: defaultdict(float)) translation_table['i']['ich'] = 0.98 translation_table['love']['gern'] = 0.98 translation_table['to'][None] = 0.98 translation_table['eat']['esse'] = 0.98 translation_table['smoked']['räucherschinken'] = 0.98 translation_table['ham']['räucherschinken'] = 0.98 fertility_table = defaultdict(lambda: defaultdict(float)) fertility_table[1]['ich'] = 0.99 fertility_table[1]['esse'] = 0.99 fertility_table[0]['ja'] = 0.99 fertility_table[1]['gern'] = 0.99 fertility_table[2]['räucherschinken'] = 0.999 fertility_table[1][None] = 0.99 probabilities = { 'p1': 0.167, 'translation_table': translation_table, 'head_distortion_table': head_distortion_table, 'non_head_distortion_table': non_head_distortion_table, 'fertility_table': fertility_table, 'alignment_table': None } model4 = IBMModel4(corpus, 0, src_classes, trg_classes, probabilities) # act probability = model4.prob_t_a_given_s(alignment_info) # assert null_generation = 5 * pow(0.167, 1) * pow(0.833, 4) fertility = 1*0.99 * 1*0.99 * 1*0.99 * 1*0.99 * 2*0.999 lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 distortion = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96 expected_probability = (null_generation * fertility * lexical_translation * distortion) self.assertEqual(round(probability, 4), round(expected_probability, 4)) nltk-3.1/nltk/test/unit/translate/test_ibm5.py0000644000076500000240000001542712607224144021237 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- """ Tests for IBM Model 5 training methods """ import unittest from collections import defaultdict from nltk.translate import AlignedSent from nltk.translate import IBMModel from nltk.translate import IBMModel4 from nltk.translate import IBMModel5 from nltk.translate.ibm_model import AlignmentInfo class TestIBMModel5(unittest.TestCase): def test_set_uniform_vacancy_probabilities_of_max_displacements(self): # arrange src_classes = {'schinken': 0, 'eier': 0, 'spam': 1} trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2} corpus = [ AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']), AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 
'spam']), ] model5 = IBMModel5(corpus, 0, src_classes, trg_classes) # act model5.set_uniform_probabilities(corpus) # assert # number of vacancy difference values = # 2 * number of words in longest target sentence expected_prob = 1.0 / (2 * 4) # examine the boundary values for (dv, max_v, trg_class) self.assertEqual(model5.head_vacancy_table[4][4][0], expected_prob) self.assertEqual(model5.head_vacancy_table[-3][1][2], expected_prob) self.assertEqual(model5.non_head_vacancy_table[4][4][0], expected_prob) self.assertEqual(model5.non_head_vacancy_table[-3][1][2], expected_prob) def test_set_uniform_vacancy_probabilities_of_non_domain_values(self): # arrange src_classes = {'schinken': 0, 'eier': 0, 'spam': 1} trg_classes = {'ham': 0, 'eggs': 1, 'spam': 2} corpus = [ AlignedSent(['ham', 'eggs'], ['schinken', 'schinken', 'eier']), AlignedSent(['spam', 'spam', 'spam', 'spam'], ['spam', 'spam']), ] model5 = IBMModel5(corpus, 0, src_classes, trg_classes) # act model5.set_uniform_probabilities(corpus) # assert # examine dv and max_v values that are not in the training data domain self.assertEqual(model5.head_vacancy_table[5][4][0], IBMModel.MIN_PROB) self.assertEqual(model5.head_vacancy_table[-4][1][2], IBMModel.MIN_PROB) self.assertEqual(model5.head_vacancy_table[4][0][0], IBMModel.MIN_PROB) self.assertEqual(model5.non_head_vacancy_table[5][4][0], IBMModel.MIN_PROB) self.assertEqual(model5.non_head_vacancy_table[-4][1][2], IBMModel.MIN_PROB) def test_prob_t_a_given_s(self): # arrange src_sentence = ["ich", 'esse', 'ja', 'gern', 'räucherschinken'] trg_sentence = ['i', 'love', 'to', 'eat', 'smoked', 'ham'] src_classes = {'räucherschinken': 0, 'ja': 1, 'ich': 2, 'esse': 3, 'gern': 4} trg_classes = {'ham': 0, 'smoked': 1, 'i': 3, 'love': 4, 'to': 2, 'eat': 4} corpus = [AlignedSent(trg_sentence, src_sentence)] alignment_info = AlignmentInfo((0, 1, 4, 0, 2, 5, 5), [None] + src_sentence, ['UNUSED'] + trg_sentence, [[3], [1], [4], [], [2], [5, 6]]) head_vacancy_table = defaultdict( lambda: defaultdict(lambda: defaultdict(float))) head_vacancy_table[1 - 0][6][3] = 0.97 # ich -> i head_vacancy_table[3 - 0][5][4] = 0.97 # esse -> eat head_vacancy_table[1 - 2][4][4] = 0.97 # gern -> love head_vacancy_table[2 - 0][2][1] = 0.97 # räucherschinken -> smoked non_head_vacancy_table = defaultdict( lambda: defaultdict(lambda: defaultdict(float))) non_head_vacancy_table[1 - 0][1][0] = 0.96 # räucherschinken -> ham translation_table = defaultdict(lambda: defaultdict(float)) translation_table['i']['ich'] = 0.98 translation_table['love']['gern'] = 0.98 translation_table['to'][None] = 0.98 translation_table['eat']['esse'] = 0.98 translation_table['smoked']['räucherschinken'] = 0.98 translation_table['ham']['räucherschinken'] = 0.98 fertility_table = defaultdict(lambda: defaultdict(float)) fertility_table[1]['ich'] = 0.99 fertility_table[1]['esse'] = 0.99 fertility_table[0]['ja'] = 0.99 fertility_table[1]['gern'] = 0.99 fertility_table[2]['räucherschinken'] = 0.999 fertility_table[1][None] = 0.99 probabilities = { 'p1': 0.167, 'translation_table': translation_table, 'fertility_table': fertility_table, 'head_vacancy_table': head_vacancy_table, 'non_head_vacancy_table': non_head_vacancy_table, 'head_distortion_table': None, 'non_head_distortion_table': None, 'alignment_table': None } model5 = IBMModel5(corpus, 0, src_classes, trg_classes, probabilities) # act probability = model5.prob_t_a_given_s(alignment_info) # assert null_generation = 5 * pow(0.167, 1) * pow(0.833, 4) fertility = 1*0.99 * 1*0.99 * 1*0.99 * 1*0.99 * 
2*0.999 lexical_translation = 0.98 * 0.98 * 0.98 * 0.98 * 0.98 * 0.98 vacancy = 0.97 * 0.97 * 1 * 0.97 * 0.97 * 0.96 expected_probability = (null_generation * fertility * lexical_translation * vacancy) self.assertEqual(round(probability, 4), round(expected_probability, 4)) def test_prune(self): # arrange alignment_infos = [ AlignmentInfo((1, 1), None, None, None), AlignmentInfo((1, 2), None, None, None), AlignmentInfo((2, 1), None, None, None), AlignmentInfo((2, 2), None, None, None), AlignmentInfo((0, 0), None, None, None) ] min_factor = IBMModel5.MIN_SCORE_FACTOR best_score = 0.9 scores = { (1, 1): min(min_factor * 1.5, 1) * best_score, # above threshold (1, 2): best_score, (2, 1): min_factor * best_score, # at threshold (2, 2): min_factor * best_score * 0.5, # low score (0, 0): min(min_factor * 1.1, 1) * 1.2 # above threshold } corpus = [AlignedSent(['a'], ['b'])] original_prob_function = IBMModel4.model4_prob_t_a_given_s # mock static method IBMModel4.model4_prob_t_a_given_s = staticmethod( lambda a, model: scores[a.alignment]) model5 = IBMModel5(corpus, 0, None, None) # act pruned_alignments = model5.prune(alignment_infos) # assert self.assertEqual(len(pruned_alignments), 3) # restore static method IBMModel4.model4_prob_t_a_given_s = original_prob_function nltk-3.1/nltk/test/unit/translate/test_ibm_model.py0000644000076500000240000002250212607224144022322 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- """ Tests for common methods of IBM translation models """ import unittest from collections import defaultdict from nltk.translate import AlignedSent from nltk.translate import IBMModel from nltk.translate.ibm_model import AlignmentInfo class TestIBMModel(unittest.TestCase): __TEST_SRC_SENTENCE = ["j'", 'aime', 'bien', 'jambon'] __TEST_TRG_SENTENCE = ['i', 'love', 'ham'] def test_vocabularies_are_initialized(self): parallel_corpora = [ AlignedSent(['one', 'two', 'three', 'four'], ['un', 'deux', 'trois']), AlignedSent(['five', 'one', 'six'], ['quatre', 'cinq', 'six']), AlignedSent([], ['sept']) ] ibm_model = IBMModel(parallel_corpora) self.assertEqual(len(ibm_model.src_vocab), 8) self.assertEqual(len(ibm_model.trg_vocab), 6) def test_vocabularies_are_initialized_even_with_empty_corpora(self): parallel_corpora = [] ibm_model = IBMModel(parallel_corpora) self.assertEqual(len(ibm_model.src_vocab), 1) # addition of NULL token self.assertEqual(len(ibm_model.trg_vocab), 0) def test_best_model2_alignment(self): # arrange sentence_pair = AlignedSent( TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE) # None and 'bien' have zero fertility translation_table = { 'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0}, 'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03}, 'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0} } alignment_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict( lambda: 0.2)))) ibm_model = IBMModel([]) ibm_model.translation_table = translation_table ibm_model.alignment_table = alignment_table # act a_info = ibm_model.best_model2_alignment(sentence_pair) # assert self.assertEqual(a_info.alignment[1:], (1, 2, 4)) # 0th element unused self.assertEqual(a_info.cepts, [[], [1], [2], [], [3]]) def test_best_model2_alignment_does_not_change_pegged_alignment(self): # arrange sentence_pair = AlignedSent( TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE) translation_table = { 'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0}, 'love': {"j'": 0.05, 
'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03}, 'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0} } alignment_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict( lambda: 0.2)))) ibm_model = IBMModel([]) ibm_model.translation_table = translation_table ibm_model.alignment_table = alignment_table # act: force 'love' to be pegged to 'jambon' a_info = ibm_model.best_model2_alignment(sentence_pair, 2, 4) # assert self.assertEqual(a_info.alignment[1:], (1, 4, 4)) self.assertEqual(a_info.cepts, [[], [1], [], [], [2, 3]]) def test_best_model2_alignment_handles_fertile_words(self): # arrange sentence_pair = AlignedSent( ['i', 'really', ',', 'really', 'love', 'ham'], TestIBMModel.__TEST_SRC_SENTENCE) # 'bien' produces 2 target words: 'really' and another 'really' translation_table = { 'i': {"j'": 0.9, 'aime': 0.05, 'bien': 0.02, 'jambon': 0.03, None: 0}, 'really': {"j'": 0, 'aime': 0, 'bien': 0.9, 'jambon': 0.01, None: 0.09}, ',': {"j'": 0, 'aime': 0, 'bien': 0.3, 'jambon': 0, None: 0.7}, 'love': {"j'": 0.05, 'aime': 0.9, 'bien': 0.01, 'jambon': 0.01, None: 0.03}, 'ham': {"j'": 0, 'aime': 0.01, 'bien': 0, 'jambon': 0.99, None: 0} } alignment_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict( lambda: 0.2)))) ibm_model = IBMModel([]) ibm_model.translation_table = translation_table ibm_model.alignment_table = alignment_table # act a_info = ibm_model.best_model2_alignment(sentence_pair) # assert self.assertEqual(a_info.alignment[1:], (1, 3, 0, 3, 2, 4)) self.assertEqual(a_info.cepts, [[3], [1], [5], [2, 4], [6]]) def test_best_model2_alignment_handles_empty_src_sentence(self): # arrange sentence_pair = AlignedSent(TestIBMModel.__TEST_TRG_SENTENCE, []) ibm_model = IBMModel([]) # act a_info = ibm_model.best_model2_alignment(sentence_pair) # assert self.assertEqual(a_info.alignment[1:], (0, 0, 0)) self.assertEqual(a_info.cepts, [[1, 2, 3]]) def test_best_model2_alignment_handles_empty_trg_sentence(self): # arrange sentence_pair = AlignedSent([], TestIBMModel.__TEST_SRC_SENTENCE) ibm_model = IBMModel([]) # act a_info = ibm_model.best_model2_alignment(sentence_pair) # assert self.assertEqual(a_info.alignment[1:], ()) self.assertEqual(a_info.cepts, [[], [], [], [], []]) def test_neighboring_finds_neighbor_alignments(self): # arrange a_info = AlignmentInfo( (0, 3, 2), (None, 'des', 'Å“ufs', 'verts'), ('UNUSED', 'green', 'eggs'), [[], [], [2], [1]] ) ibm_model = IBMModel([]) # act neighbors = ibm_model.neighboring(a_info) # assert neighbor_alignments = set() for neighbor in neighbors: neighbor_alignments.add(neighbor.alignment) expected_alignments = set([ # moves (0, 0, 2), (0, 1, 2), (0, 2, 2), (0, 3, 0), (0, 3, 1), (0, 3, 3), # swaps (0, 2, 3), # original alignment (0, 3, 2) ]) self.assertEqual(neighbor_alignments, expected_alignments) def test_neighboring_sets_neighbor_alignment_info(self): # arrange a_info = AlignmentInfo( (0, 3, 2), (None, 'des', 'Å“ufs', 'verts'), ('UNUSED', 'green', 'eggs'), [[], [], [2], [1]] ) ibm_model = IBMModel([]) # act neighbors = ibm_model.neighboring(a_info) # assert: select a few particular alignments for neighbor in neighbors: if neighbor.alignment == (0, 2, 2): moved_alignment = neighbor elif neighbor.alignment == (0, 3, 2): swapped_alignment = neighbor self.assertEqual(moved_alignment.cepts, [[], [], [1, 2], []]) self.assertEqual(swapped_alignment.cepts, [[], [], [2], [1]]) def test_neighboring_returns_neighbors_with_pegged_alignment(self): # arrange a_info = AlignmentInfo( (0, 3, 2), (None, 
'des', 'Å“ufs', 'verts'), ('UNUSED', 'green', 'eggs'), [[], [], [2], [1]] ) ibm_model = IBMModel([]) # act: peg 'eggs' to align with 'Å“ufs' neighbors = ibm_model.neighboring(a_info, 2) # assert neighbor_alignments = set() for neighbor in neighbors: neighbor_alignments.add(neighbor.alignment) expected_alignments = set([ # moves (0, 0, 2), (0, 1, 2), (0, 2, 2), # no swaps # original alignment (0, 3, 2) ]) self.assertEqual(neighbor_alignments, expected_alignments) def test_hillclimb(self): # arrange initial_alignment = AlignmentInfo((0, 3, 2), None, None, None) def neighboring_mock(a, j): if a.alignment == (0, 3, 2): return set([ AlignmentInfo((0, 2, 2), None, None, None), AlignmentInfo((0, 1, 1), None, None, None) ]) elif a.alignment == (0, 2, 2): return set([ AlignmentInfo((0, 3, 3), None, None, None), AlignmentInfo((0, 4, 4), None, None, None) ]) return set() def prob_t_a_given_s_mock(a): prob_values = { (0, 3, 2): 0.5, (0, 2, 2): 0.6, (0, 1, 1): 0.4, (0, 3, 3): 0.6, (0, 4, 4): 0.7 } return prob_values.get(a.alignment, 0.01) ibm_model = IBMModel([]) ibm_model.neighboring = neighboring_mock ibm_model.prob_t_a_given_s = prob_t_a_given_s_mock # act best_alignment = ibm_model.hillclimb(initial_alignment) # assert: hill climbing goes from (0, 3, 2) -> (0, 2, 2) -> (0, 4, 4) self.assertEqual(best_alignment.alignment, (0, 4, 4)) def test_sample(self): # arrange sentence_pair = AlignedSent( TestIBMModel.__TEST_TRG_SENTENCE, TestIBMModel.__TEST_SRC_SENTENCE) ibm_model = IBMModel([]) ibm_model.prob_t_a_given_s = lambda x: 0.001 # act samples, best_alignment = ibm_model.sample(sentence_pair) # assert self.assertEqual(len(samples), 61) nltk-3.1/nltk/test/unit/translate/test_stack_decoder.py0000644000076500000240000002310512607224144023165 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Stack decoder # # Copyright (C) 2001-2015 NLTK Project # Author: Tah Wei Hoon # URL: # For license information, see LICENSE.TXT """ Tests for stack decoder """ import unittest from collections import defaultdict from math import log from nltk.translate import PhraseTable from nltk.translate import StackDecoder from nltk.translate.stack_decoder import _Hypothesis, _Stack class TestStackDecoder(unittest.TestCase): def test_find_all_src_phrases(self): # arrange phrase_table = TestStackDecoder.create_fake_phrase_table() stack_decoder = StackDecoder(phrase_table, None) sentence = ('my', 'hovercraft', 'is', 'full', 'of', 'eels') # act src_phrase_spans = stack_decoder.find_all_src_phrases(sentence) # assert self.assertEqual(src_phrase_spans[0], [2]) # 'my hovercraft' self.assertEqual(src_phrase_spans[1], [2]) # 'hovercraft' self.assertEqual(src_phrase_spans[2], [3]) # 'is' self.assertEqual(src_phrase_spans[3], [5, 6]) # 'full of', 'full of eels' self.assertFalse(src_phrase_spans[4]) # no entry starting with 'of' self.assertEqual(src_phrase_spans[5], [6]) # 'eels' def test_distortion_score(self): # arrange stack_decoder = StackDecoder(None, None) stack_decoder.distortion_factor = 0.5 hypothesis = _Hypothesis() hypothesis.src_phrase_span = (3, 5) # act score = stack_decoder.distortion_score(hypothesis, (8, 10)) # assert expected_score = log(stack_decoder.distortion_factor) * (8 - 5) self.assertEqual(score, expected_score) def test_distortion_score_of_first_expansion(self): # arrange stack_decoder = StackDecoder(None, None) stack_decoder.distortion_factor = 0.5 hypothesis = _Hypothesis() # act score = stack_decoder.distortion_score(hypothesis, (8, 10)) # assert # expansion from empty hypothesis 
always has zero distortion cost self.assertEqual(score, 0.0) def test_compute_future_costs(self): # arrange phrase_table = TestStackDecoder.create_fake_phrase_table() language_model = TestStackDecoder.create_fake_language_model() stack_decoder = StackDecoder(phrase_table, language_model) sentence = ('my', 'hovercraft', 'is', 'full', 'of', 'eels') # act future_scores = stack_decoder.compute_future_scores(sentence) # assert self.assertEqual( future_scores[1][2], (phrase_table.translations_for(('hovercraft',))[0].log_prob + language_model.probability(('hovercraft',)))) self.assertEqual( future_scores[0][2], (phrase_table.translations_for(('my', 'hovercraft'))[0].log_prob + language_model.probability(('my', 'hovercraft')))) def test_compute_future_costs_for_phrases_not_in_phrase_table(self): # arrange phrase_table = TestStackDecoder.create_fake_phrase_table() language_model = TestStackDecoder.create_fake_language_model() stack_decoder = StackDecoder(phrase_table, language_model) sentence = ('my', 'hovercraft', 'is', 'full', 'of', 'eels') # act future_scores = stack_decoder.compute_future_scores(sentence) # assert self.assertEqual( future_scores[1][3], # 'hovercraft is' is not in phrase table future_scores[1][2] + future_scores[2][3]) # backoff def test_future_score(self): # arrange: sentence with 8 words; words 2, 3, 4 already translated hypothesis = _Hypothesis() hypothesis.untranslated_spans = lambda _: [(0, 2), (5, 8)] # mock future_score_table = defaultdict(lambda: defaultdict(float)) future_score_table[0][2] = 0.4 future_score_table[5][8] = 0.5 stack_decoder = StackDecoder(None, None) # act future_score = stack_decoder.future_score( hypothesis, future_score_table, 8) # assert self.assertEqual(future_score, 0.4 + 0.5) def test_valid_phrases(self): # arrange hypothesis = _Hypothesis() # mock untranslated_spans method hypothesis.untranslated_spans = lambda _: [ (0, 2), (3, 6) ] all_phrases_from = [ [1, 4], [2], [], [5], [5, 6, 7], [], [7] ] # act phrase_spans = StackDecoder.valid_phrases(all_phrases_from, hypothesis) # assert self.assertEqual(phrase_spans, [(0, 1), (1, 2), (3, 5), (4, 5), (4, 6)]) @staticmethod def create_fake_phrase_table(): phrase_table = PhraseTable() phrase_table.add(('hovercraft',), ('',), 0.8) phrase_table.add(('my', 'hovercraft'), ('', ''), 0.7) phrase_table.add(('my', 'cheese'), ('', ''), 0.7) phrase_table.add(('is',), ('',), 0.8) phrase_table.add(('is',), ('',), 0.5) phrase_table.add(('full', 'of'), ('', ''), 0.01) phrase_table.add(('full', 'of', 'eels'), ('', '', ''), 0.5) phrase_table.add(('full', 'of', 'spam'), ('', ''), 0.5) phrase_table.add(('eels',), ('',), 0.5) phrase_table.add(('spam',), ('',), 0.5) return phrase_table @staticmethod def create_fake_language_model(): # nltk.model should be used here once it is implemented language_prob = defaultdict(lambda: -999.0) language_prob[('my',)] = log(0.1) language_prob[('hovercraft',)] = log(0.1) language_prob[('is',)] = log(0.1) language_prob[('full',)] = log(0.1) language_prob[('of',)] = log(0.1) language_prob[('eels',)] = log(0.1) language_prob[('my', 'hovercraft',)] = log(0.3) language_model = type( '', (object,), {'probability': lambda _, phrase: language_prob[phrase]})() return language_model class TestHypothesis(unittest.TestCase): def setUp(self): root = _Hypothesis() child = _Hypothesis( raw_score=0.5, src_phrase_span=(3, 7), trg_phrase=('hello', 'world'), previous=root ) grandchild = _Hypothesis( raw_score=0.4, src_phrase_span=(1, 2), trg_phrase=('and', 'goodbye'), previous=child ) self.hypothesis_chain = 
grandchild def test_translation_so_far(self): # act translation = self.hypothesis_chain.translation_so_far() # assert self.assertEqual(translation, ['hello', 'world', 'and', 'goodbye']) def test_translation_so_far_for_empty_hypothesis(self): # arrange hypothesis = _Hypothesis() # act translation = hypothesis.translation_so_far() # assert self.assertEqual(translation, []) def test_total_translated_words(self): # act total_translated_words = self.hypothesis_chain.total_translated_words() # assert self.assertEqual(total_translated_words, 5) def test_translated_positions(self): # act translated_positions = self.hypothesis_chain.translated_positions() # assert translated_positions.sort() self.assertEqual(translated_positions, [1, 3, 4, 5, 6]) def test_untranslated_spans(self): # act untranslated_spans = self.hypothesis_chain.untranslated_spans(10) # assert self.assertEqual(untranslated_spans, [(0, 1), (2, 3), (7, 10)]) def test_untranslated_spans_for_empty_hypothesis(self): # arrange hypothesis = _Hypothesis() # act untranslated_spans = hypothesis.untranslated_spans(10) # assert self.assertEqual(untranslated_spans, [(0, 10)]) class TestStack(unittest.TestCase): def test_push_bumps_off_worst_hypothesis_when_stack_is_full(self): # arrange stack = _Stack(3) poor_hypothesis = _Hypothesis(0.01) # act stack.push(_Hypothesis(0.2)) stack.push(poor_hypothesis) stack.push(_Hypothesis(0.1)) stack.push(_Hypothesis(0.3)) # assert self.assertFalse(poor_hypothesis in stack) def test_push_removes_hypotheses_that_fall_below_beam_threshold(self): # arrange stack = _Stack(3, 0.5) poor_hypothesis = _Hypothesis(0.01) worse_hypothesis = _Hypothesis(0.009) # act stack.push(poor_hypothesis) stack.push(worse_hypothesis) stack.push(_Hypothesis(0.9)) # greatly superior hypothesis # assert self.assertFalse(poor_hypothesis in stack) self.assertFalse(worse_hypothesis in stack) def test_push_does_not_add_hypothesis_that_falls_below_beam_threshold(self): # arrange stack = _Stack(3, 0.5) poor_hypothesis = _Hypothesis(0.01) # act stack.push(_Hypothesis(0.9)) # greatly superior hypothesis stack.push(poor_hypothesis) # assert self.assertFalse(poor_hypothesis in stack) def test_best_returns_the_best_hypothesis(self): # arrange stack = _Stack(3) best_hypothesis = _Hypothesis(0.99) # act stack.push(_Hypothesis(0.0)) stack.push(best_hypothesis) stack.push(_Hypothesis(0.5)) # assert self.assertEqual(stack.best(), best_hypothesis) def test_best_returns_none_when_stack_is_empty(self): # arrange stack = _Stack(3) # assert self.assertEqual(stack.best(), None) nltk-3.1/nltk/test/unit/utils.py0000644000076500000240000000252112574600335016501 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import from unittest import TestCase from functools import wraps from nose.plugins.skip import SkipTest from nltk.util import py26 def skip(reason): """ Unconditionally skip a test. """ def decorator(test_item): is_test_class = isinstance(test_item, type) and issubclass(test_item, TestCase) if is_test_class and py26(): # Patch all test_ methods to raise SkipText exception. # This is necessary for Python 2.6 because its unittest # doesn't understand __unittest_skip__. 
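            # Re-decorate each test_* method individually: the wrapped
            # method raises SkipTest(reason) when called, instead of
            # running the original test body.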
for meth_name in (m for m in dir(test_item) if m.startswith('test_')): patched_method = skip(reason)(getattr(test_item, meth_name)) setattr(test_item, meth_name, patched_method) if not is_test_class: @wraps(test_item) def skip_wrapper(*args, **kwargs): raise SkipTest(reason) skip_wrapper.__name__ = test_item.__name__ test_item = skip_wrapper test_item.__unittest_skip__ = True test_item.__unittest_skip_why__ = reason return test_item return decorator def skipIf(condition, reason): """ Skip a test if the condition is true. """ if condition: return skip(reason) return lambda obj: objnltk-3.1/nltk/test/util.doctest0000644000076500000240000000204112607224144016346 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ================= Utility functions ================= >>> from __future__ import print_function >>> from nltk.util import * >>> from nltk.tree import Tree >>> print_string("This is a long string, therefore it should break", 25) This is a long string, therefore it should break >>> re_show("[a-z]+", "sdf123") {sdf}123 >>> tree = Tree(5, ... [Tree(4, [Tree(2, [1, 3])]), ... Tree(8, [Tree(6, [7]), 9])]) >>> for x in breadth_first(tree): ... if isinstance(x, int): print(x) ... else: print(x.label()) 5 4 8 2 6 9 1 3 7 >>> for x in breadth_first(tree, maxdepth=2): ... if isinstance(x, int): print(x) ... else: print(x.label()) 5 4 8 2 6 9 >>> invert_dict({1: 2}) defaultdict(<... 'list'>, {2: 1}) >>> invert_dict({1: [3, 4, 5]}) defaultdict(<... 'list'>, {3: [1], 4: [1], 5: [1]}) nltk-3.1/nltk/test/wordnet.doctest0000644000076500000240000004757112607247357017107 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT ================= WordNet Interface ================= WordNet is just another NLTK corpus reader, and can be imported like this: >>> from __future__ import print_function, unicode_literals >>> from nltk.corpus import wordnet For more compact code, we recommend: >>> from nltk.corpus import wordnet as wn ----- Words ----- Look up a word using ``synsets()``; this function has an optional ``pos`` argument which lets you constrain the part of speech of the word: >>> wn.synsets('dog') # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE [Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')] >>> wn.synsets('dog', pos=wn.VERB) [Synset('chase.v.01')] The other parts of speech are ``NOUN``, ``ADJ`` and ``ADV``. A synset is identified with a 3-part name of the form: word.pos.nn: >>> wn.synset('dog.n.01') Synset('dog.n.01') >>> print(wn.synset('dog.n.01').definition()) a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds >>> len(wn.synset('dog.n.01').examples()) 1 >>> print(wn.synset('dog.n.01').examples()[0]) the dog barked all night >>> wn.synset('dog.n.01').lemmas() [Lemma('dog.n.01.dog'), Lemma('dog.n.01.domestic_dog'), Lemma('dog.n.01.Canis_familiaris')] >>> [str(lemma.name()) for lemma in wn.synset('dog.n.01').lemmas()] ['dog', 'domestic_dog', 'Canis_familiaris'] >>> wn.lemma('dog.n.01.dog').synset() Synset('dog.n.01') The WordNet corpus reader gives access to the Open Multilingual WordNet, using ISO-639 language codes. 
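For instance, you can check whether a particular language code is
available before requesting data for it (this assumes the Open
Multilingual Wordnet data is installed, as the rest of these examples
do):

    >>> 'jpn' in wn.langs()
    True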
>>> sorted(wn.langs()) # doctest: +NORMALIZE_WHITESPACE ['als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eng', 'eus', 'fas', 'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'ita', 'jpn', 'nno', 'nob', 'pol', 'por', 'qcn', 'slv', 'spa', 'swe', 'tha', 'zsm'] >>> wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn') [Synset('dog.n.01'), Synset('spy.n.01')] wn.synset('spy.n.01').lemma_names('jpn') # doctest: +NORMALIZE_WHITESPACE ['\u3044\u306c', '\u307e\u308f\u3057\u8005', '\u30b9\u30d1\u30a4', '\u56de\u3057\u8005', '\u56de\u8005', '\u5bc6\u5075', '\u5de5\u4f5c\u54e1', '\u5efb\u3057\u8005', '\u5efb\u8005', '\u63a2', '\u63a2\u308a', '\u72ac', '\u79d8\u5bc6\u635c\u67fb\u54e1', '\u8adc\u5831\u54e1', '\u8adc\u8005', '\u9593\u8005', '\u9593\u8adc', '\u96a0\u5bc6'] >>> wn.synset('dog.n.01').lemma_names('ita') ['cane', 'Canis_familiaris'] >>> wn.lemmas('cane', lang='ita') # doctest: +NORMALIZE_WHITESPACE [Lemma('dog.n.01.cane'), Lemma('cramp.n.02.cane'), Lemma('hammer.n.01.cane'), Lemma('bad_person.n.01.cane'), Lemma('incompetent.n.01.cane')] >>> sorted(wn.synset('dog.n.01').lemmas('dan')) # doctest: +NORMALIZE_WHITESPACE [Lemma('dog.n.01.hund'), Lemma('dog.n.01.k\xf8ter'), Lemma('dog.n.01.vovhund'), Lemma('dog.n.01.vovse')] sorted(wn.synset('dog.n.01').lemmas('por')) [Lemma('dog.n.01.cachorra'), Lemma('dog.n.01.cachorro'), Lemma('dog.n.01.cadela'), Lemma('dog.n.01.c\xe3o')] >>> dog_lemma = wn.lemma(b'dog.n.01.c\xc3\xa3o'.decode('utf-8'), lang='por') >>> dog_lemma Lemma('dog.n.01.c\xe3o') >>> dog_lemma.lang() 'por' >>> len(wordnet.all_lemma_names(pos='n', lang='jpn')) 64797 ------- Synsets ------- `Synset`: a set of synonyms that share a common meaning. >>> dog = wn.synset('dog.n.01') >>> dog.hypernyms() [Synset('canine.n.02'), Synset('domestic_animal.n.01')] >>> dog.hyponyms() # doctest: +ELLIPSIS [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), Synset('dalmatian.n.02'), ...] >>> dog.member_holonyms() [Synset('canis.n.01'), Synset('pack.n.06')] >>> dog.root_hypernyms() [Synset('entity.n.01')] >>> wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01')) [Synset('carnivore.n.01')] Each synset contains one or more lemmas, which represent a specific sense of a specific word. Note that some relations are defined by WordNet only over Lemmas: >>> good = wn.synset('good.a.01') >>> good.antonyms() Traceback (most recent call last): File "", line 1, in AttributeError: 'Synset' object has no attribute 'antonyms' >>> good.lemmas()[0].antonyms() [Lemma('bad.a.01.bad')] The relations that are currently defined in this way are `antonyms`, `derivationally_related_forms` and `pertainyms`. ------ Lemmas ------ >>> eat = wn.lemma('eat.v.03.eat') >>> eat Lemma('feed.v.06.eat') >>> print(eat.key()) eat%2:34:02:: >>> eat.count() 4 >>> wn.lemma_from_key(eat.key()) Lemma('feed.v.06.eat') >>> wn.lemma_from_key(eat.key()).synset() Synset('feed.v.06') >>> wn.lemma_from_key('feebleminded%5:00:00:retarded:00') Lemma('backward.s.03.feebleminded') >>> for lemma in wn.synset('eat.v.03').lemmas(): ... print(lemma, lemma.count()) ... Lemma('feed.v.06.feed') 3 Lemma('feed.v.06.eat') 4 >>> for lemma in wn.lemmas('eat', 'v'): ... print(lemma, lemma.count()) ... 
Lemma('eat.v.01.eat') 61 Lemma('eat.v.02.eat') 13 Lemma('feed.v.06.eat') 4 Lemma('eat.v.04.eat') 0 Lemma('consume.v.05.eat') 0 Lemma('corrode.v.01.eat') 0 Lemmas can also have relations between them: >>> vocal = wn.lemma('vocal.a.01.vocal') >>> vocal.derivationally_related_forms() [Lemma('vocalize.v.02.vocalize')] >>> vocal.pertainyms() [Lemma('voice.n.02.voice')] >>> vocal.antonyms() [Lemma('instrumental.a.01.instrumental')] The three relations above exist only on lemmas, not on synsets. ----------- Verb Frames ----------- >>> wn.synset('think.v.01').frame_ids() [5, 9] >>> for lemma in wn.synset('think.v.01').lemmas(): ... print(lemma, lemma.frame_ids()) ... print(" | ".join(lemma.frame_strings())) ... Lemma('think.v.01.think') [5, 9] Something think something Adjective/Noun | Somebody think somebody Lemma('think.v.01.believe') [5, 9] Something believe something Adjective/Noun | Somebody believe somebody Lemma('think.v.01.consider') [5, 9] Something consider something Adjective/Noun | Somebody consider somebody Lemma('think.v.01.conceive') [5, 9] Something conceive something Adjective/Noun | Somebody conceive somebody >>> wn.synset('stretch.v.02').frame_ids() [8] >>> for lemma in wn.synset('stretch.v.02').lemmas(): ... print(lemma, lemma.frame_ids()) ... print(" | ".join(lemma.frame_strings())) ... Lemma('stretch.v.02.stretch') [8, 2] Somebody stretch something | Somebody stretch Lemma('stretch.v.02.extend') [8] Somebody extend something ---------- Similarity ---------- >>> dog = wn.synset('dog.n.01') >>> cat = wn.synset('cat.n.01') >>> hit = wn.synset('hit.v.01') >>> slap = wn.synset('slap.v.01') ``synset1.path_similarity(synset2):`` Return a score denoting how similar two word senses are, based on the shortest path that connects the senses in the is-a (hypernym/hypnoym) taxonomy. The score is in the range 0 to 1. By default, there is now a fake root node added to verbs so for cases where previously a path could not be found---and None was returned---it should return a value. The old behavior can be achieved by setting simulate_root to be False. A score of 1 represents identity i.e. comparing a sense with itself will return 1. >>> dog.path_similarity(cat) # doctest: +ELLIPSIS 0.2... >>> hit.path_similarity(slap) # doctest: +ELLIPSIS 0.142... >>> wn.path_similarity(hit, slap) # doctest: +ELLIPSIS 0.142... >>> print(hit.path_similarity(slap, simulate_root=False)) None >>> print(wn.path_similarity(hit, slap, simulate_root=False)) None ``synset1.lch_similarity(synset2):`` Leacock-Chodorow Similarity: Return a score denoting how similar two word senses are, based on the shortest path that connects the senses (as above) and the maximum depth of the taxonomy in which the senses occur. The relationship is given as -log(p/2d) where p is the shortest path length and d the taxonomy depth. >>> dog.lch_similarity(cat) # doctest: +ELLIPSIS 2.028... >>> hit.lch_similarity(slap) # doctest: +ELLIPSIS 1.312... >>> wn.lch_similarity(hit, slap) # doctest: +ELLIPSIS 1.312... >>> print(hit.lch_similarity(slap, simulate_root=False)) None >>> print(wn.lch_similarity(hit, slap, simulate_root=False)) None ``synset1.wup_similarity(synset2):`` Wu-Palmer Similarity: Return a score denoting how similar two word senses are, based on the depth of the two senses in the taxonomy and that of their Least Common Subsumer (most specific ancestor node). Note that at this time the scores given do _not_ always agree with those given by Pedersen's Perl implementation of Wordnet Similarity. 
The LCS does not necessarily feature in the shortest path connecting the two senses, as it is by definition the common ancestor deepest in the taxonomy, not closest to the two senses. Typically, however, it will so feature. Where multiple candidates for the LCS exist, that whose shortest path to the root node is the longest will be selected. Where the LCS has multiple paths to the root, the longer path is used for the purposes of the calculation. >>> dog.wup_similarity(cat) # doctest: +ELLIPSIS 0.857... >>> hit.wup_similarity(slap) 0.25 >>> wn.wup_similarity(hit, slap) 0.25 >>> print(hit.wup_similarity(slap, simulate_root=False)) None >>> print(wn.wup_similarity(hit, slap, simulate_root=False)) None ``wordnet_ic`` Information Content: Load an information content file from the wordnet_ic corpus. >>> from nltk.corpus import wordnet_ic >>> brown_ic = wordnet_ic.ic('ic-brown.dat') >>> semcor_ic = wordnet_ic.ic('ic-semcor.dat') Or you can create an information content dictionary from a corpus (or anything that has a words() method). >>> from nltk.corpus import genesis >>> genesis_ic = wn.ic(genesis, False, 0.0) ``synset1.res_similarity(synset2, ic):`` Resnik Similarity: Return a score denoting how similar two word senses are, based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node). Note that for any similarity measure that uses information content, the result is dependent on the corpus used to generate the information content and the specifics of how the information content was created. >>> dog.res_similarity(cat, brown_ic) # doctest: +ELLIPSIS 7.911... >>> dog.res_similarity(cat, genesis_ic) # doctest: +ELLIPSIS 7.204... ``synset1.jcn_similarity(synset2, ic):`` Jiang-Conrath Similarity Return a score denoting how similar two word senses are, based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node) and that of the two input Synsets. The relationship is given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)). >>> dog.jcn_similarity(cat, brown_ic) # doctest: +ELLIPSIS 0.449... >>> dog.jcn_similarity(cat, genesis_ic) # doctest: +ELLIPSIS 0.285... ``synset1.lin_similarity(synset2, ic):`` Lin Similarity: Return a score denoting how similar two word senses are, based on the Information Content (IC) of the Least Common Subsumer (most specific ancestor node) and that of the two input Synsets. The relationship is given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)). >>> dog.lin_similarity(cat, semcor_ic) # doctest: +ELLIPSIS 0.886... --------------------- Access to all Synsets --------------------- Iterate over all the noun synsets: >>> for synset in list(wn.all_synsets('n'))[:10]: ... print(synset) ... Synset('entity.n.01') Synset('physical_entity.n.01') Synset('abstraction.n.06') Synset('thing.n.12') Synset('object.n.01') Synset('whole.n.02') Synset('congener.n.03') Synset('living_thing.n.01') Synset('organism.n.01') Synset('benthos.n.02') Get all synsets for this word, possibly restricted by POS: >>> wn.synsets('dog') # doctest: +ELLIPSIS [Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), ...] >>> wn.synsets('dog', pos='v') [Synset('chase.v.01')] Walk through the noun synsets looking at their hypernyms: >>> from itertools import islice >>> for synset in islice(wn.all_synsets('n'), 5): ... print(synset, synset.hypernyms()) ... 
Synset('entity.n.01') [] Synset('physical_entity.n.01') [Synset('entity.n.01')] Synset('abstraction.n.06') [Synset('entity.n.01')] Synset('thing.n.12') [Synset('physical_entity.n.01')] Synset('object.n.01') [Synset('physical_entity.n.01')] ------ Morphy ------ Look up forms not in WordNet, with the help of Morphy: >>> wn.morphy('denied', wn.NOUN) >>> print(wn.morphy('denied', wn.VERB)) deny >>> wn.synsets('denied', wn.NOUN) [] >>> wn.synsets('denied', wn.VERB) # doctest: +NORMALIZE_WHITESPACE [Synset('deny.v.01'), Synset('deny.v.02'), Synset('deny.v.03'), Synset('deny.v.04'), Synset('deny.v.05'), Synset('traverse.v.03'), Synset('deny.v.07')] Morphy uses a combination of inflectional ending rules and exception lists to handle a variety of different possibilities: >>> print(wn.morphy('dogs')) dog >>> print(wn.morphy('churches')) church >>> print(wn.morphy('aardwolves')) aardwolf >>> print(wn.morphy('abaci')) abacus >>> print(wn.morphy('book', wn.NOUN)) book >>> wn.morphy('hardrock', wn.ADV) >>> wn.morphy('book', wn.ADJ) >>> wn.morphy('his', wn.NOUN) >>> --------------- Synset Closures --------------- Compute transitive closures of synsets >>> dog = wn.synset('dog.n.01') >>> hypo = lambda s: s.hyponyms() >>> hyper = lambda s: s.hypernyms() >>> list(dog.closure(hypo, depth=1)) == dog.hyponyms() True >>> list(dog.closure(hyper, depth=1)) == dog.hypernyms() True >>> list(dog.closure(hypo)) # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS [Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), Synset('dalmatian.n.02'), Synset('great_pyrenees.n.01'), Synset('griffon.n.02'), Synset('hunting_dog.n.01'), Synset('lapdog.n.01'), Synset('leonberg.n.01'), Synset('mexican_hairless.n.01'), Synset('newfoundland.n.01'), Synset('pooch.n.01'), Synset('poodle.n.01'), ...] >>> list(dog.closure(hyper)) # doctest: +NORMALIZE_WHITESPACE [Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'), Synset('animal.n.01'), Synset('placental.n.01'), Synset('organism.n.01'), Synset('mammal.n.01'), Synset('living_thing.n.01'), Synset('vertebrate.n.01'), Synset('whole.n.02'), Synset('chordate.n.01'), Synset('object.n.01'), Synset('physical_entity.n.01'), Synset('entity.n.01')] ---------------- Regression Tests ---------------- Bug 85: morphy returns the base form of a word, if it's input is given as a base form for a POS for which that word is not defined: >>> wn.synsets('book', wn.NOUN) [Synset('book.n.01'), Synset('book.n.02'), Synset('record.n.05'), Synset('script.n.01'), Synset('ledger.n.01'), Synset('book.n.06'), Synset('book.n.07'), Synset('koran.n.01'), Synset('bible.n.01'), Synset('book.n.10'), Synset('book.n.11')] >>> wn.synsets('book', wn.ADJ) [] >>> wn.morphy('book', wn.NOUN) 'book' >>> wn.morphy('book', wn.ADJ) Bug 160: wup_similarity breaks when the two synsets have no common hypernym >>> t = wn.synsets('picasso')[0] >>> m = wn.synsets('male')[1] >>> t.wup_similarity(m) # doctest: +ELLIPSIS 0.631... >>> t = wn.synsets('titan')[1] >>> s = wn.synsets('say', wn.VERB)[0] >>> print(t.wup_similarity(s)) None Bug 21: "instance of" not included in LCS (very similar to bug 160) >>> a = wn.synsets("writings")[0] >>> b = wn.synsets("scripture")[0] >>> brown_ic = wordnet_ic.ic('ic-brown.dat') >>> a.jcn_similarity(b, brown_ic) # doctest: +ELLIPSIS 0.175... Bug 221: Verb root IC is zero >>> from nltk.corpus.reader.wordnet import information_content >>> s = wn.synsets('say', wn.VERB)[0] >>> information_content(s, brown_ic) # doctest: +ELLIPSIS 4.623... 
Bug 161: Comparison between WN keys/lemmas should not be case sensitive >>> k = wn.synsets("jefferson")[0].lemmas()[0].key() >>> wn.lemma_from_key(k) Lemma('jefferson.n.01.Jefferson') >>> wn.lemma_from_key(k.upper()) Lemma('jefferson.n.01.Jefferson') Bug 99: WordNet root_hypernyms gives incorrect results >>> from nltk.corpus import wordnet as wn >>> for s in wn.all_synsets(wn.NOUN): ... if s.root_hypernyms()[0] != wn.synset('entity.n.01'): ... print(s, s.root_hypernyms()) ... >>> Bug 382: JCN Division by zero error >>> tow = wn.synset('tow.v.01') >>> shlep = wn.synset('shlep.v.02') >>> from nltk.corpus import wordnet_ic >>> brown_ic = wordnet_ic.ic('ic-brown.dat') >>> tow.jcn_similarity(shlep, brown_ic) # doctest: +ELLIPSIS 1...e+300 Bug 428: Depth is zero for instance nouns >>> s = wn.synset("lincoln.n.01") >>> s.max_depth() > 0 True Bug 429: Information content smoothing used old reference to all_synsets >>> genesis_ic = wn.ic(genesis, True, 1.0) Bug 430: all_synsets used wrong pos lookup when synsets were cached >>> for ii in wn.all_synsets(): pass >>> for ii in wn.all_synsets(): pass Bug 470: shortest_path_distance ignored instance hypernyms >>> google = wordnet.synsets("google")[0] >>> earth = wordnet.synsets("earth")[0] >>> google.wup_similarity(earth) # doctest: +ELLIPSIS 0.1... Bug 484: similarity metrics returned -1 instead of None for no LCS >>> t = wn.synsets('fly', wn.VERB)[0] >>> s = wn.synsets('say', wn.VERB)[0] >>> print(s.shortest_path_distance(t)) None >>> print(s.path_similarity(t, simulate_root=False)) None >>> print(s.lch_similarity(t, simulate_root=False)) None >>> print(s.wup_similarity(t, simulate_root=False)) None Bug 427: "pants" does not return all the senses it should >>> from nltk.corpus import wordnet >>> wordnet.synsets("pants",'n') [Synset('bloomers.n.01'), Synset('pant.n.01'), Synset('trouser.n.01'), Synset('gasp.n.01')] Bug 482: Some nouns not being lemmatised by WordNetLemmatizer().lemmatize >>> from nltk.stem.wordnet import WordNetLemmatizer >>> WordNetLemmatizer().lemmatize("eggs", pos="n") 'egg' >>> WordNetLemmatizer().lemmatize("legs", pos="n") 'leg' Bug 284: instance hypernyms not used in similarity calculations >>> wn.synset('john.n.02').lch_similarity(wn.synset('dog.n.01')) # doctest: +ELLIPSIS 1.335... >>> wn.synset('john.n.02').wup_similarity(wn.synset('dog.n.01')) # doctest: +ELLIPSIS 0.571... >>> wn.synset('john.n.02').res_similarity(wn.synset('dog.n.01'), brown_ic) # doctest: +ELLIPSIS 2.224... >>> wn.synset('john.n.02').jcn_similarity(wn.synset('dog.n.01'), brown_ic) # doctest: +ELLIPSIS 0.075... >>> wn.synset('john.n.02').lin_similarity(wn.synset('dog.n.01'), brown_ic) # doctest: +ELLIPSIS 0.252... 
>>> wn.synset('john.n.02').hypernym_paths() # doctest: +ELLIPSIS [[Synset('entity.n.01'), ..., Synset('john.n.02')]] Issue 541: add domains to wordnet >>> wn.synset('code.n.03').topic_domains() [Synset('computer_science.n.01')] >>> wn.synset('pukka.a.01').region_domains() [Synset('india.n.01')] >>> wn.synset('freaky.a.01').usage_domains() [Synset('slang.n.02')] Issue 629: wordnet failures when python run with -O optimizations >>> # Run the test suite with python -O to check this >>> wn.synsets("brunch") [Synset('brunch.n.01'), Synset('brunch.v.01')] Issue 395: wordnet returns incorrect result for lowest_common_hypernyms of chef and policeman >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01')) [Synset('person.n.01')] nltk-3.1/nltk/test/wordnet_fixt.py0000644000076500000240000000023412574600335017075 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- from __future__ import absolute_import def teardown_module(module=None): from nltk.corpus import wordnet wordnet._unload() nltk-3.1/nltk/test/wordnet_lch.doctest0000644000076500000240000000440712607224144017711 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT =============================== WordNet Lowest Common Hypernyms =============================== Wordnet's lowest_common_hypernyms() method is based used to locate the lowest single hypernym that is shared by two given words: >>> from nltk.corpus import wordnet as wn >>> wn.synset('kin.n.01').lowest_common_hypernyms(wn.synset('mother.n.01')) [Synset('relative.n.01')] >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01')) [Synset('person.n.01')] This method generally returns a single result, but in some cases, more than one valid LCH is possible: >>> wn.synset('body.n.09').lowest_common_hypernyms(wn.synset('sidereal_day.n.01')) [Synset('attribute.n.02'), Synset('measure.n.02')] In some cases, lowest_common_hypernyms() can return one of the synsets which was passed to it as an argument: >>> wn.synset('woman.n.01').lowest_common_hypernyms(wn.synset('girlfriend.n.02')) [Synset('woman.n.01')] In NLTK 3.0a2 the behavior of lowest_common_hypernyms() was changed to give more accurate results in a small set of cases, generally when dealing with nouns describing social roles or jobs. To emulate the pre v3.0a2 behavior, you can set the use_min_depth=True flag: >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01')) [Synset('person.n.01')] >>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01'), use_min_depth=True) [Synset('organism.n.01')] In some cases use_min_depth=True may return more or fewer results than the default behavior: >>> wn.synset('woman.n.01').lowest_common_hypernyms(wn.synset('girlfriend.n.02')) [Synset('woman.n.01')] >>> wn.synset('woman.n.01').lowest_common_hypernyms(wn.synset('girlfriend.n.02'), use_min_depth=True) [Synset('organism.n.01'), Synset('woman.n.01')] In the general case, however, they tend to return the same results: >>> wn.synset('body.n.09').lowest_common_hypernyms(wn.synset('sidereal_day.n.01')) [Synset('attribute.n.02'), Synset('measure.n.02')] >>> wn.synset('body.n.09').lowest_common_hypernyms(wn.synset('sidereal_day.n.01'), use_min_depth=True) [Synset('attribute.n.02'), Synset('measure.n.02')] nltk-3.1/nltk/test/wsd.doctest0000644000076500000240000000560212607224144016174 0ustar sbstaff00000000000000.. Copyright (C) 2001-2015 NLTK Project .. For license information, see LICENSE.TXT .. 
-*- coding: utf-8 -*- ========================= Word Sense Disambiguation ========================= Lesk Algorithm -------------- Performs the classic Lesk algorithm for Word Sense Disambiguation (WSD) using a the definitions of the ambiguous word. Given an ambiguous word and the context in which the word occurs, Lesk returns a Synset with the highest number of overlapping words between the context sentence and different definitions from each Synset. >>> from nltk.wsd import lesk >>> sent = ['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.'] >>> print(lesk(sent, 'bank', 'n')) Synset('savings_bank.n.02') >>> print(lesk(sent, 'bank')) Synset('savings_bank.n.02') The definitions for "bank" are: >>> from nltk.corpus import wordnet as wn >>> for ss in wn.synsets('bank'): ... print(ss, ss.definition()) ... Synset('bank.n.01') sloping land (especially the slope beside a body of water) Synset('depository_financial_institution.n.01') a financial institution that accepts deposits and channels the money into lending activities Synset('bank.n.03') a long ridge or pile Synset('bank.n.04') an arrangement of similar objects in a row or in tiers Synset('bank.n.05') a supply or stock held in reserve for future use (especially in emergencies) Synset('bank.n.06') the funds held by a gambling house or the dealer in some gambling games Synset('bank.n.07') a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force Synset('savings_bank.n.02') a container (usually with a slot in the top) for keeping money at home Synset('bank.n.09') a building in which the business of banking transacted Synset('bank.n.10') a flight maneuver; aircraft tips laterally about its longitudinal axis (especially in turning) Synset('bank.v.01') tip laterally Synset('bank.v.02') enclose with a bank Synset('bank.v.03') do business with a bank or keep an account at a bank Synset('bank.v.04') act as the banker in a game or in gambling Synset('bank.v.05') be in the banking business Synset('deposit.v.02') put into a bank account Synset('bank.v.07') cover with ashes so to control the rate of burning Synset('trust.v.01') have confidence or faith in Test disambiguation of POS tagged `able`. >>> [(s, s.pos()) for s in wn.synsets('able')] [(Synset('able.a.01'), 'a'), (Synset('able.s.02'), 's'), (Synset('able.s.03'), 's'), (Synset('able.s.04'), 's')] >>> sent = 'people should be able to marry a person of their choice'.split() >>> lesk(sent, 'able') Synset('able.s.04') >>> lesk(sent, 'able', pos='a') Synset('able.a.01') Test behavior if there is are no matching senses. >>> lesk('John loves Mary'.split(), 'loves', synsets=[]) nltk-3.1/nltk/text.py0000644000076500000240000005401412607224144014370 0ustar sbstaff00000000000000# Natural Language Toolkit: Texts # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # URL: # For license information, see LICENSE.TXT """ This module brings together a variety of NLTK functionality for text analysis, and provides simple, interactive interfaces. Functionality includes: concordancing, collocation discovery, regular expression search over tokenized strings, and distributional similarity. 
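A short interactive sketch (illustrative only; it assumes the Gutenberg
corpus data has already been installed via ``nltk.download()``):

    >>> from nltk.corpus import gutenberg                        # doctest: +SKIP
    >>> from nltk.text import Text                               # doctest: +SKIP
    >>> moby = Text(gutenberg.words('melville-moby_dick.txt'))   # doctest: +SKIP
    >>> moby.concordance('monstrous', lines=2)                   # doctest: +SKIP
    >>> moby.collocations(num=10)                                # doctest: +SKIP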
""" from __future__ import print_function, division, unicode_literals from math import log from collections import defaultdict from functools import reduce from itertools import islice import re from nltk.probability import FreqDist, LidstoneProbDist from nltk.probability import ConditionalFreqDist as CFD from nltk.util import tokenwrap, LazyConcatenation from nltk.metrics import f_measure, BigramAssocMeasures from nltk.collocations import BigramCollocationFinder from nltk.compat import python_2_unicode_compatible, text_type, Counter class ContextIndex(object): """ A bidirectional index between words and their 'contexts' in a text. The context of a word is usually defined to be the words that occur in a fixed window around the word; but other definitions may also be used by providing a custom context function. """ @staticmethod def _default_context(tokens, i): """One left token and one right token, normalized to lowercase""" left = (tokens[i-1].lower() if i != 0 else '*START*') right = (tokens[i+1].lower() if i != len(tokens) - 1 else '*END*') return (left, right) def __init__(self, tokens, context_func=None, filter=None, key=lambda x:x): self._key = key self._tokens = tokens if context_func: self._context_func = context_func else: self._context_func = self._default_context if filter: tokens = [t for t in tokens if filter(t)] self._word_to_contexts = CFD((self._key(w), self._context_func(tokens, i)) for i, w in enumerate(tokens)) self._context_to_words = CFD((self._context_func(tokens, i), self._key(w)) for i, w in enumerate(tokens)) def tokens(self): """ :rtype: list(str) :return: The document that this context index was created from. """ return self._tokens def word_similarity_dict(self, word): """ Return a dictionary mapping from words to 'similarity scores,' indicating how often these two words occur in the same context. """ word = self._key(word) word_contexts = set(self._word_to_contexts[word]) scores = {} for w, w_contexts in self._word_to_contexts.items(): scores[w] = f_measure(word_contexts, set(w_contexts)) return scores def similar_words(self, word, n=20): scores = defaultdict(int) for c in self._word_to_contexts[self._key(word)]: for w in self._context_to_words[c]: if w != word: scores[w] += self._context_to_words[c][word] * self._context_to_words[c][w] return sorted(scores, key=scores.get, reverse=True)[:n] def common_contexts(self, words, fail_on_unknown=False): """ Find contexts where the specified words can all appear; and return a frequency distribution mapping each context to the number of times that context was used. :param words: The words used to seed the similarity search :type words: str :param fail_on_unknown: If true, then raise a value error if any of the given words do not occur at all in the index. """ words = [self._key(w) for w in words] contexts = [set(self._word_to_contexts[w]) for w in words] empty = [words[i] for i in range(len(words)) if not contexts[i]] common = reduce(set.intersection, contexts) if empty and fail_on_unknown: raise ValueError("The following word(s) were not found:", " ".join(words)) elif not common: # nothing in common -- just return an empty freqdist. return FreqDist() else: fd = FreqDist(c for w in words for c in self._word_to_contexts[w] if c in common) return fd @python_2_unicode_compatible class ConcordanceIndex(object): """ An index that can be used to look up the offset locations at which a given word occurs in a document. """ def __init__(self, tokens, key=lambda x:x): """ Construct a new concordance index. 
:param tokens: The document (list of tokens) that this concordance index was created from. This list can be used to access the context of a given word occurrence. :param key: A function that maps each token to a normalized version that will be used as a key in the index. E.g., if you use ``key=lambda s:s.lower()``, then the index will be case-insensitive. """ self._tokens = tokens """The document (list of tokens) that this concordance index was created from.""" self._key = key """Function mapping each token to an index key (or None).""" self._offsets = defaultdict(list) """Dictionary mapping words (or keys) to lists of offset indices.""" # Initialize the index (self._offsets) for index, word in enumerate(tokens): word = self._key(word) self._offsets[word].append(index) def tokens(self): """ :rtype: list(str) :return: The document that this concordance index was created from. """ return self._tokens def offsets(self, word): """ :rtype: list(int) :return: A list of the offset positions at which the given word occurs. If a key function was specified for the index, then given word's key will be looked up. """ word = self._key(word) return self._offsets[word] def __repr__(self): return '' % ( len(self._tokens), len(self._offsets)) def print_concordance(self, word, width=75, lines=25): """ Print a concordance for ``word`` with the specified context window. :param word: The target word :type word: str :param width: The width of each line, in characters (default=80) :type width: int :param lines: The number of lines to display (default=25) :type lines: int """ half_width = (width - len(word) - 2) // 2 context = width // 4 # approx number of words of context offsets = self.offsets(word) if offsets: lines = min(lines, len(offsets)) print("Displaying %s of %s matches:" % (lines, len(offsets))) for i in offsets: if lines <= 0: break left = (' ' * half_width + ' '.join(self._tokens[i-context:i])) right = ' '.join(self._tokens[i+1:i+context]) left = left[-half_width:] right = right[:half_width] print(left, self._tokens[i], right) lines -= 1 else: print("No matches") class TokenSearcher(object): """ A class that makes it easier to use regular expressions to search over tokenized strings. The tokenized string is converted to a string where tokens are marked with angle brackets -- e.g., ``''``. The regular expression passed to the ``findall()`` method is modified to treat angle brackets as non-capturing parentheses, in addition to matching the token boundaries; and to have ``'.'`` not match the angle brackets. """ def __init__(self, tokens): self._raw = ''.join('<'+w+'>' for w in tokens) def findall(self, regexp): """ Find instances of the regular expression in the text. The text is a list of tokens, and a regexp pattern to match a single token must be surrounded by angle brackets. E.g. >>> from nltk.text import TokenSearcher >>> print('hack'); from nltk.book import text1, text5, text9 hack... 
>>> text5.findall("<.*><.*>") you rule bro; telling you bro; u twizted bro >>> text1.findall("(<.*>)") monied; nervous; dangerous; white; white; white; pious; queer; good; mature; white; Cape; great; wise; wise; butterless; white; fiendish; pale; furious; better; certain; complete; dismasted; younger; brave; brave; brave; brave >>> text9.findall("{3,}") thread through those; the thought that; that the thing; the thing that; that that thing; through these than through; them that the; through the thick; them that they; thought that the :param regexp: A regular expression :type regexp: str """ # preprocess the regular expression regexp = re.sub(r'\s', '', regexp) regexp = re.sub(r'<', '(?:<(?:', regexp) regexp = re.sub(r'>', ')>)', regexp) regexp = re.sub(r'(?]', regexp) # perform the search hits = re.findall(regexp, self._raw) # Sanity check for h in hits: if not h.startswith('<') and h.endswith('>'): raise ValueError('Bad regexp for TokenSearcher.findall') # postprocess the output hits = [h[1:-1].split('><') for h in hits] return hits @python_2_unicode_compatible class Text(object): """ A wrapper around a sequence of simple (string) tokens, which is intended to support initial exploration of texts (via the interactive console). Its methods perform a variety of analyses on the text's contexts (e.g., counting, concordancing, collocation discovery), and display the results. If you wish to write a program which makes use of these analyses, then you should bypass the ``Text`` class, and use the appropriate analysis function or class directly instead. A ``Text`` is typically initialized from a given document or corpus. E.g.: >>> import nltk.corpus >>> from nltk.text import Text >>> moby = Text(nltk.corpus.gutenberg.words('melville-moby_dick.txt')) """ # This defeats lazy loading, but makes things faster. This # *shouldn't* be necessary because the corpus view *should* be # doing intelligent caching, but without this it's running slow. # Look into whether the caching is working correctly. _COPY_TOKENS = True def __init__(self, tokens, name=None): """ Create a Text object. :param tokens: The source text. :type tokens: sequence of str """ if self._COPY_TOKENS: tokens = list(tokens) self.tokens = tokens if name: self.name = name elif ']' in tokens[:20]: end = tokens[:20].index(']') self.name = " ".join(text_type(tok) for tok in tokens[1:end]) else: self.name = " ".join(text_type(tok) for tok in tokens[:8]) + "..." #//////////////////////////////////////////////////////////// # Support item & slice access #//////////////////////////////////////////////////////////// def __getitem__(self, i): if isinstance(i, slice): return self.tokens[i.start:i.stop] else: return self.tokens[i] def __len__(self): return len(self.tokens) #//////////////////////////////////////////////////////////// # Interactive console methods #//////////////////////////////////////////////////////////// def concordance(self, word, width=79, lines=25): """ Print a concordance for ``word`` with the specified context window. Word matching is not case-sensitive. :seealso: ``ConcordanceIndex`` """ if '_concordance_index' not in self.__dict__: #print("Building index...") self._concordance_index = ConcordanceIndex(self.tokens, key=lambda s:s.lower()) self._concordance_index.print_concordance(word, width, lines) def collocations(self, num=20, window_size=2): """ Print collocations derived from the text, ignoring stopwords. :seealso: find_collocations :param num: The maximum number of collocations to print. 
:type num: int :param window_size: The number of tokens spanned by a collocation (default=2) :type window_size: int """ if not ('_collocations' in self.__dict__ and self._num == num and self._window_size == window_size): self._num = num self._window_size = window_size #print("Building collocations list") from nltk.corpus import stopwords ignored_words = stopwords.words('english') finder = BigramCollocationFinder.from_words(self.tokens, window_size) finder.apply_freq_filter(2) finder.apply_word_filter(lambda w: len(w) < 3 or w.lower() in ignored_words) bigram_measures = BigramAssocMeasures() self._collocations = finder.nbest(bigram_measures.likelihood_ratio, num) colloc_strings = [w1+' '+w2 for w1, w2 in self._collocations] print(tokenwrap(colloc_strings, separator="; ")) def count(self, word): """ Count the number of times this word appears in the text. """ return self.tokens.count(word) def index(self, word): """ Find the index of the first occurrence of the word in the text. """ return self.tokens.index(word) def readability(self, method): # code from nltk_contrib.readability raise NotImplementedError def similar(self, word, num=20): """ Distributional similarity: find other words which appear in the same contexts as the specified word; list most similar words first. :param word: The word used to seed the similarity search :type word: str :param num: The number of words to generate (default=20) :type num: int :seealso: ContextIndex.similar_words() """ if '_word_context_index' not in self.__dict__: #print('Building word-context index...') self._word_context_index = ContextIndex(self.tokens, filter=lambda x:x.isalpha(), key=lambda s:s.lower()) # words = self._word_context_index.similar_words(word, num) word = word.lower() wci = self._word_context_index._word_to_contexts if word in wci.conditions(): contexts = set(wci[word]) fd = Counter(w for w in wci.conditions() for c in wci[w] if c in contexts and not w == word) words = [w for w, _ in fd.most_common(num)] print(tokenwrap(words)) else: print("No matches") def common_contexts(self, words, num=20): """ Find contexts where the specified words appear; list most frequent common contexts first. :param word: The word used to seed the similarity search :type word: str :param num: The number of words to generate (default=20) :type num: int :seealso: ContextIndex.common_contexts() """ if '_word_context_index' not in self.__dict__: #print('Building word-context index...') self._word_context_index = ContextIndex(self.tokens, key=lambda s:s.lower()) try: fd = self._word_context_index.common_contexts(words, True) if not fd: print("No common contexts were found") else: ranked_contexts = [w for w, _ in fd.most_common(num)] print(tokenwrap(w1+"_"+w2 for w1,w2 in ranked_contexts)) except ValueError as e: print(e) def dispersion_plot(self, words): """ Produce a plot showing the distribution of the words through the text. Requires pylab to be installed. :param words: The words to be plotted :type words: list(str) :seealso: nltk.draw.dispersion_plot() """ from nltk.draw import dispersion_plot dispersion_plot(self, words) def plot(self, *args): """ See documentation for FreqDist.plot() :seealso: nltk.prob.FreqDist.plot() """ self.vocab().plot(*args) def vocab(self): """ :seealso: nltk.prob.FreqDist """ if "_vocab" not in self.__dict__: #print("Building vocabulary index...") self._vocab = FreqDist(self) return self._vocab def findall(self, regexp): """ Find instances of the regular expression in the text. 
The text is a list of tokens, and a regexp pattern to match a single token must be surrounded by angle brackets. E.g. >>> print('hack'); from nltk.book import text1, text5, text9 hack... >>> text5.findall("<.*><.*>") you rule bro; telling you bro; u twizted bro >>> text1.findall("(<.*>)") monied; nervous; dangerous; white; white; white; pious; queer; good; mature; white; Cape; great; wise; wise; butterless; white; fiendish; pale; furious; better; certain; complete; dismasted; younger; brave; brave; brave; brave >>> text9.findall("{3,}") thread through those; the thought that; that the thing; the thing that; that that thing; through these than through; them that the; through the thick; them that they; thought that the :param regexp: A regular expression :type regexp: str """ if "_token_searcher" not in self.__dict__: self._token_searcher = TokenSearcher(self) hits = self._token_searcher.findall(regexp) hits = [' '.join(h) for h in hits] print(tokenwrap(hits, "; ")) #//////////////////////////////////////////////////////////// # Helper Methods #//////////////////////////////////////////////////////////// _CONTEXT_RE = re.compile('\w+|[\.\!\?]') def _context(self, tokens, i): """ One left & one right token, both case-normalized. Skip over non-sentence-final punctuation. Used by the ``ContextIndex`` that is created for ``similar()`` and ``common_contexts()``. """ # Left context j = i-1 while j>=0 and not self._CONTEXT_RE.match(tokens[j]): j -= 1 left = (tokens[j] if j != 0 else '*START*') # Right context j = i+1 while j' % self.name def __repr__(self): return '' % self.name # Prototype only; this approach will be slow to load class TextCollection(Text): """A collection of texts, which can be loaded with list of texts, or with a corpus consisting of one or more texts, and which supports counting, concordancing, collocation discovery, etc. Initialize a TextCollection as follows: >>> import nltk.corpus >>> from nltk.text import TextCollection >>> print('hack'); from nltk.book import text1, text2, text3 hack... >>> gutenberg = TextCollection(nltk.corpus.gutenberg) >>> mytexts = TextCollection([text1, text2, text3]) Iterating over a TextCollection produces all the tokens of all the texts in order. """ def __init__(self, source): if hasattr(source, 'words'): # bridge to the text corpus reader source = [source.words(f) for f in source.fileids()] self._texts = source Text.__init__(self, LazyConcatenation(source)) self._idf_cache = {} def tf(self, term, text): """ The frequency of the term in text. """ return text.count(term) / len(text) def idf(self, term): """ The number of texts in the corpus divided by the number of texts that the term appears in. If a term does not appear in the corpus, 0.0 is returned. """ # idf values are cached for performance. idf = self._idf_cache.get(term) if idf is None: matches = len([True for text in self._texts if term in text]) # FIXME Should this raise some kind of error instead? 
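            # This is the unsmoothed idf, log(N / df): a term that occurs in
            # every text gets idf == log(1) == 0, and a term that occurs in
            # no text is mapped to 0.0 below rather than raising an error.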
idf = (log(float(len(self._texts)) / matches) if matches else 0.0) self._idf_cache[term] = idf return idf def tf_idf(self, term, text): return self.tf(term, text) * self.idf(term) def demo(): from nltk.corpus import brown text = Text(brown.words(categories='news')) print(text) print() print("Concordance:") text.concordance('news') print() print("Distributionally similar words:") text.similar('news') print() print("Collocations:") text.collocations() print() #print("Automatically generated text:") #text.generate() #print() print("Dispersion plot:") text.dispersion_plot(['news', 'report', 'said', 'announced']) print() print("Vocabulary plot:") text.plot(50) print() print("Indexing:") print("text[3]:", text[3]) print("text[3:5]:", text[3:5]) print("text.vocab()['news']:", text.vocab()['news']) if __name__ == '__main__': demo() __all__ = ["ContextIndex", "ConcordanceIndex", "TokenSearcher", "Text", "TextCollection"] nltk-3.1/nltk/tgrep.py0000644000076500000240000011372212607224144014527 0ustar sbstaff00000000000000#!/usr/bin/env python # -*- coding: utf-8 -*- # # Natural Language Toolkit: TGrep search # # Copyright (C) 2001-2015 NLTK Project # Author: Will Roberts # URL: # For license information, see LICENSE.TXT ''' ============================================ TGrep search implementation for NLTK trees ============================================ This module supports TGrep2 syntax for matching parts of NLTK Trees. Note that many tgrep operators require the tree passed to be a ``ParentedTree``. External links: - `Tgrep tutorial `_ - `Tgrep2 manual `_ - `Tgrep2 source `_ Usage ===== >>> from nltk.tree import ParentedTree >>> from nltk.tgrep import tgrep_nodes, tgrep_positions >>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))') >>> list(tgrep_nodes('NN', [tree])) [[ParentedTree('NN', ['dog']), ParentedTree('NN', ['cat'])]] >>> list(tgrep_positions('NN', [tree])) [[(0, 2), (2, 1)]] >>> list(tgrep_nodes('DT', [tree])) [[ParentedTree('DT', ['the']), ParentedTree('DT', ['a'])]] >>> list(tgrep_nodes('DT $ JJ', [tree])) [[ParentedTree('DT', ['the'])]] This implementation adds syntax to select nodes based on their NLTK tree position. This syntax is ``N`` plus a Python tuple representing the tree position. For instance, ``N()``, ``N(0,)``, ``N(0,0)`` are valid node selectors. Example: >>> tree = ParentedTree.fromstring('(S (NP (DT the) (JJ big) (NN dog)) (VP bit) (NP (DT a) (NN cat)))') >>> tree[0,0] ParentedTree('DT', ['the']) >>> tree[0,0].treeposition() (0, 0) >>> list(tgrep_nodes('N(0,0)', [tree])) [[ParentedTree('DT', ['the'])]] Caveats: ======== - Link modifiers: "?" and "=" are not implemented. - Tgrep compatibility: Using "@" for "!", "{" for "<", "}" for ">" are not implemented. - The "=" and "~" links are not implemented. Known Issues: ============= - There are some issues with link relations involving leaf nodes (which are represented as bare strings in NLTK trees). For instance, consider the tree:: (S (A x)) The search string ``* !>> S`` should select all nodes which are not dominated in some way by an ``S`` node (i.e., all nodes which are not descendants of an ``S``). Clearly, in this tree, the only node which fulfills this criterion is the top node (since it is not dominated by anything). However, the code here will find both the top node and the leaf node ``x``. This is because we cannot recover the parent of the leaf, since it is stored as a bare string. 
A possible workaround, when performing this kind of search, would be to filter out all leaf nodes. Implementation notes ==================== This implementation is (somewhat awkwardly) based on lambda functions which are predicates on a node. A predicate is a function which is either True or False; using a predicate function, we can identify sets of nodes with particular properties. A predicate function, could, for instance, return True only if a particular node has a label matching a particular regular expression, and has a daughter node which has no sisters. Because tgrep2 search strings can do things statefully (such as substituting in macros, and binding nodes with node labels), the actual predicate function is declared with three arguments:: pred = lambda n, m, l: return True # some logic here ``n`` is a node in a tree; this argument must always be given ``m`` contains a dictionary, mapping macro names onto predicate functions ``l`` is a dictionary to map node labels onto nodes in the tree ``m`` and ``l`` are declared to default to ``None``, and so need not be specified in a call to a predicate. Predicates which call other predicates must always pass the value of these arguments on. The top-level predicate (constructed by ``_tgrep_exprs_action``) binds the macro definitions to ``m`` and initialises ``l`` to an empty dictionary. ''' from __future__ import absolute_import, print_function, unicode_literals from nltk.compat import binary_type, text_type import functools import nltk.tree try: import pyparsing except ImportError: print('Warning: nltk.tgrep will not work without the `pyparsing` package') print('installed.') import re class TgrepException(Exception): '''Tgrep exception type.''' pass def ancestors(node): ''' Returns the list of all nodes dominating the given tree node. This method will not work with leaf nodes, since there is no way to recover the parent. ''' results = [] try: current = node.parent() except AttributeError: # if node is a leaf, we cannot retrieve its parent return results while current: results.append(current) current = current.parent() return results def unique_ancestors(node): ''' Returns the list of all nodes dominating the given node, where there is only a single path of descent. ''' results = [] try: current = node.parent() except AttributeError: # if node is a leaf, we cannot retrieve its parent return results while current and len(current) == 1: results.append(current) current = current.parent() return results def _descendants(node): ''' Returns the list of all nodes which are descended from the given tree node in some way. ''' try: treepos = node.treepositions() except AttributeError: return [] return [node[x] for x in treepos[1:]] def _leftmost_descendants(node): ''' Returns the set of all nodes descended in some way through left branches from this node. ''' try: treepos = node.treepositions() except AttributeError: return [] return [node[x] for x in treepos[1:] if all(y == 0 for y in x)] def _rightmost_descendants(node): ''' Returns the set of all nodes descended in some way through right branches from this node. ''' try: rightmost_leaf = max(node.treepositions()) except AttributeError: return [] return [node[rightmost_leaf[:i]] for i in range(1, len(rightmost_leaf) + 1)] def _istree(obj): '''Predicate to check whether `obj` is a nltk.tree.Tree.''' return isinstance(obj, nltk.tree.Tree) def _unique_descendants(node): ''' Returns the list of all nodes descended from the given node, where there is only a single path of descent. 
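    For example, for the tree ``(S (NP (NN dog)))``, the unique
    descendants of the ``S`` node are ``NP``, ``NN`` and the leaf
    ``'dog'``.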
''' results = [] current = node while current and _istree(current) and len(current) == 1: current = current[0] results.append(current) return results def _before(node): ''' Returns the set of all nodes that are before the given node. ''' try: pos = node.treeposition() tree = node.root() except AttributeError: return [] return [tree[x] for x in tree.treepositions() if x[:len(pos)] < pos[:len(x)]] def _immediately_before(node): ''' Returns the set of all nodes that are immediately before the given node. Tree node A immediately precedes node B if the last terminal symbol (word) produced by A immediately precedes the first terminal symbol produced by B. ''' try: pos = node.treeposition() tree = node.root() except AttributeError: return [] # go "upwards" from pos until there is a place we can go to the left idx = len(pos) - 1 while 0 <= idx and pos[idx] == 0: idx -= 1 if idx < 0: return [] pos = list(pos[:idx + 1]) pos[-1] -= 1 before = tree[pos] return [before] + _rightmost_descendants(before) def _after(node): ''' Returns the set of all nodes that are after the given node. ''' try: pos = node.treeposition() tree = node.root() except AttributeError: return [] return [tree[x] for x in tree.treepositions() if x[:len(pos)] > pos[:len(x)]] def _immediately_after(node): ''' Returns the set of all nodes that are immediately after the given node. Tree node A immediately follows node B if the first terminal symbol (word) produced by A immediately follows the last terminal symbol produced by B. ''' try: pos = node.treeposition() tree = node.root() current = node.parent() except AttributeError: return [] # go "upwards" from pos until there is a place we can go to the # right idx = len(pos) - 1 while 0 <= idx and pos[idx] == len(current) - 1: idx -= 1 current = current.parent() if idx < 0: return [] pos = list(pos[:idx + 1]) pos[-1] += 1 after = tree[pos] return [after] + _leftmost_descendants(after) def _tgrep_node_literal_value(node): ''' Gets the string value of a given parse tree node, for comparison using the tgrep node literal predicates. ''' return (node.label() if _istree(node) else text_type(node)) def _tgrep_macro_use_action(_s, _l, tokens): ''' Builds a lambda function which looks up the macro name used. ''' assert len(tokens) == 1 assert tokens[0][0] == '@' macro_name = tokens[0][1:] def macro_use(n, m=None, l=None): if m is None or macro_name not in m: raise TgrepException('macro {0} not defined'.format(macro_name)) return m[macro_name](n, m, l) return macro_use def _tgrep_node_action(_s, _l, tokens): ''' Builds a lambda function representing a predicate on a tree node depending on the name of its node. 
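    For example (an informal summary of the cases handled below): the
    pattern ``NN|NNS`` matches nodes labelled either ``NN`` or ``NNS``;
    ``*`` and ``__`` match any node; ``"VP"`` matches the literal label
    ``VP``; ``/NN.*/`` matches labels against a regular expression; and an
    ``i@`` prefix makes the comparison case-insensitive.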
''' # print 'node tokens: ', tokens if tokens[0] == "'": # strip initial apostrophe (tgrep2 print command) tokens = tokens[1:] if len(tokens) > 1: # disjunctive definition of a node name assert list(set(tokens[1::2])) == ['|'] # recursively call self to interpret each node name definition tokens = [_tgrep_node_action(None, None, [node]) for node in tokens[::2]] # capture tokens and return the disjunction return (lambda t: lambda n, m=None, l=None: any(f(n, m, l) for f in t))(tokens) else: if hasattr(tokens[0], '__call__'): # this is a previously interpreted parenthetical node # definition (lambda function) return tokens[0] elif tokens[0] == '*' or tokens[0] == '__': return lambda n, m=None, l=None: True elif tokens[0].startswith('"'): assert tokens[0].endswith('"') node_lit = tokens[0][1:-1].replace('\\"', '"').replace('\\\\', '\\') return (lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s)(node_lit) elif tokens[0].startswith('/'): assert tokens[0].endswith('/') node_lit = tokens[0][1:-1] return (lambda r: lambda n, m=None, l=None: r.search(_tgrep_node_literal_value(n)))(re.compile(node_lit)) elif tokens[0].startswith('i@'): node_func = _tgrep_node_action(_s, _l, [tokens[0][2:].lower()]) return (lambda f: lambda n, m=None, l=None: f(_tgrep_node_literal_value(n).lower()))(node_func) else: return (lambda s: lambda n, m=None, l=None: _tgrep_node_literal_value(n) == s)(tokens[0]) def _tgrep_parens_action(_s, _l, tokens): ''' Builds a lambda function representing a predicate on a tree node from a parenthetical notation. ''' # print 'parenthetical tokens: ', tokens assert len(tokens) == 3 assert tokens[0] == '(' assert tokens[2] == ')' return tokens[1] def _tgrep_nltk_tree_pos_action(_s, _l, tokens): ''' Builds a lambda function representing a predicate on a tree node which returns true if the node is located at a specific tree position. ''' # recover the tuple from the parsed sting node_tree_position = tuple(int(x) for x in tokens if x.isdigit()) # capture the node's tree position return (lambda i: lambda n, m=None, l=None: (hasattr(n, 'treeposition') and n.treeposition() == i))(node_tree_position) def _tgrep_relation_action(_s, _l, tokens): ''' Builds a lambda function representing a predicate on a tree node depending on its relation to other nodes in the tree. ''' # print 'relation tokens: ', tokens # process negation first if needed negated = False if tokens[0] == '!': negated = True tokens = tokens[1:] if tokens[0] == '[': # process square-bracketed relation expressions assert len(tokens) == 3 assert tokens[2] == ']' retval = tokens[1] else: # process operator-node relation expressions assert len(tokens) == 2 operator, predicate = tokens # A < B A is the parent of (immediately dominates) B. if operator == '<': retval = lambda n, m=None, l=None: (_istree(n) and any(predicate(x, m, l) for x in n)) # A > B A is the child of B. elif operator == '>': retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and bool(n.parent()) and predicate(n.parent(), m, l)) # A <, B Synonymous with A <1 B. elif operator == '<,' or operator == '<1': retval = lambda n, m=None, l=None: (_istree(n) and bool(list(n)) and predicate(n[0], m, l)) # A >, B Synonymous with A >1 B. elif operator == '>,' or operator == '>1': retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and bool(n.parent()) and (n is n.parent()[0]) and predicate(n.parent(), m, l)) # A N B A is the Nth child of B (the first child is >1). 
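        # (That is, an operator of the form '>N', where N is written in
        # digits: A is the Nth child of B.)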
elif operator[0] == '>' and operator[1:].isdigit(): idx = int(operator[1:]) # capture the index parameter retval = (lambda i: lambda n, m=None, l=None: (hasattr(n, 'parent') and bool(n.parent()) and 0 <= i < len(n.parent()) and (n is n.parent()[i]) and predicate(n.parent(), m, l)))(idx - 1) # A <' B B is the last child of A (also synonymous with A <-1 B). # A <- B B is the last child of A (synonymous with A <-1 B). elif operator == '<\'' or operator == '<-' or operator == '<-1': retval = lambda n, m=None, l=None: (_istree(n) and bool(list(n)) and predicate(n[-1], m, l)) # A >' B A is the last child of B (also synonymous with A >-1 B). # A >- B A is the last child of B (synonymous with A >-1 B). elif operator == '>\'' or operator == '>-' or operator == '>-1': retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and bool(n.parent()) and (n is n.parent()[-1]) and predicate(n.parent(), m, l)) # A <-N B B is the N th-to-last child of A (the last child is <-1). elif operator[:2] == '<-' and operator[2:].isdigit(): idx = -int(operator[2:]) # capture the index parameter retval = (lambda i: lambda n, m=None, l=None: (_istree(n) and bool(list(n)) and 0 <= (i + len(n)) < len(n) and predicate(n[i + len(n)], m, l)))(idx) # A >-N B A is the N th-to-last child of B (the last child is >-1). elif operator[:2] == '>-' and operator[2:].isdigit(): idx = -int(operator[2:]) # capture the index parameter retval = (lambda i: lambda n, m=None, l=None: (hasattr(n, 'parent') and bool(n.parent()) and 0 <= (i + len(n.parent())) < len(n.parent()) and (n is n.parent()[i + len(n.parent())]) and predicate(n.parent(), m, l)))(idx) # A <: B B is the only child of A elif operator == '<:': retval = lambda n, m=None, l=None: (_istree(n) and len(n) == 1 and predicate(n[0], m, l)) # A >: B A is the only child of B. elif operator == '>:': retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and bool(n.parent()) and len(n.parent()) == 1 and predicate(n.parent(), m, l)) # A << B A dominates B (A is an ancestor of B). elif operator == '<<': retval = lambda n, m=None, l=None: (_istree(n) and any(predicate(x, m, l) for x in _descendants(n))) # A >> B A is dominated by B (A is a descendant of B). elif operator == '>>': retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in ancestors(n)) # A <<, B B is a left-most descendant of A. elif operator == '<<,' or operator == '<<1': retval = lambda n, m=None, l=None: (_istree(n) and any(predicate(x, m, l) for x in _leftmost_descendants(n))) # A >>, B A is a left-most descendant of B. elif operator == '>>,': retval = lambda n, m=None, l=None: any((predicate(x, m, l) and n in _leftmost_descendants(x)) for x in ancestors(n)) # A <<' B B is a right-most descendant of A. elif operator == '<<\'': retval = lambda n, m=None, l=None: (_istree(n) and any(predicate(x, m, l) for x in _rightmost_descendants(n))) # A >>' B A is a right-most descendant of B. elif operator == '>>\'': retval = lambda n, m=None, l=None: any((predicate(x, m, l) and n in _rightmost_descendants(x)) for x in ancestors(n)) # A <<: B There is a single path of descent from A and B is on it. elif operator == '<<:': retval = lambda n, m=None, l=None: (_istree(n) and any(predicate(x, m, l) for x in _unique_descendants(n))) # A >>: B There is a single path of descent from B and A is on it. elif operator == '>>:': retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in unique_ancestors(n)) # A . B A immediately precedes B. 
elif operator == '.': retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in _immediately_after(n)) # A , B A immediately follows B. elif operator == ',': retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in _immediately_before(n)) # A .. B A precedes B. elif operator == '..': retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in _after(n)) # A ,, B A follows B. elif operator == ',,': retval = lambda n, m=None, l=None: any(predicate(x, m, l) for x in _before(n)) # A $ B A is a sister of B (and A != B). elif operator == '$' or operator == '%': retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and bool(n.parent()) and any(predicate(x, m, l) for x in n.parent() if x is not n)) # A $. B A is a sister of and immediately precedes B. elif operator == '$.' or operator == '%.': retval = lambda n, m=None, l=None: (hasattr(n, 'right_sibling') and bool(n.right_sibling()) and predicate(n.right_sibling(), m, l)) # A $, B A is a sister of and immediately follows B. elif operator == '$,' or operator == '%,': retval = lambda n, m=None, l=None: (hasattr(n, 'left_sibling') and bool(n.left_sibling()) and predicate(n.left_sibling(), m, l)) # A $.. B A is a sister of and precedes B. elif operator == '$..' or operator == '%..': retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and hasattr(n, 'parent_index') and bool(n.parent()) and any(predicate(x, m, l) for x in n.parent()[n.parent_index() + 1:])) # A $,, B A is a sister of and follows B. elif operator == '$,,' or operator == '%,,': retval = lambda n, m=None, l=None: (hasattr(n, 'parent') and hasattr(n, 'parent_index') and bool(n.parent()) and any(predicate(x, m, l) for x in n.parent()[:n.parent_index()])) else: raise TgrepException( 'cannot interpret tgrep operator "{0}"'.format(operator)) # now return the built function if negated: return (lambda r: (lambda n, m=None, l=None: not r(n, m, l)))(retval) else: return retval def _tgrep_conjunction_action(_s, _l, tokens, join_char = '&'): ''' Builds a lambda function representing a predicate on a tree node from the conjunction of several other such lambda functions. This is prototypically called for expressions like (`tgrep_rel_conjunction`):: < NP & < AP < VP where tokens is a list of predicates representing the relations (`< NP`, `< AP`, and `< VP`), possibly with the character `&` included (as in the example here). This is also called for expressions like (`tgrep_node_expr2`):: NP < NN S=s < /NP/=n : s < /VP/=v : n .. v tokens[0] is a tgrep_expr predicate; tokens[1:] are an (optional) list of segmented patterns (`tgrep_expr_labeled`, processed by `_tgrep_segmented_pattern_action`). ''' # filter out the ampersand tokens = [x for x in tokens if x != join_char] # print 'relation conjunction tokens: ', tokens if len(tokens) == 1: return tokens[0] else: return (lambda ts: lambda n, m=None, l=None: all(predicate(n, m, l) for predicate in ts))(tokens) def _tgrep_segmented_pattern_action(_s, _l, tokens): ''' Builds a lambda function representing a segmented pattern. Called for expressions like (`tgrep_expr_labeled`):: =s .. =v < =n This is a segmented pattern, a tgrep2 expression which begins with a node label. The problem is that for segemented_pattern_action (': =v < =s'), the first element (in this case, =v) is specifically selected by virtue of matching a particular node in the tree; to retrieve the node, we need the label, not a lambda function. 
For node labels inside a tgrep_node_expr, we need a lambda function which returns true if the node visited is the same as =v. We solve this by creating two copies of a node_label_use in the grammar; the label use inside a tgrep_expr_labeled has a separate parse action to the pred use inside a node_expr. See `_tgrep_node_label_use_action` and `_tgrep_node_label_pred_use_action`. ''' # tokens[0] is a string containing the node label node_label = tokens[0] # tokens[1:] is an (optional) list of predicates which must all # hold of the bound node reln_preds = tokens[1:] def pattern_segment_pred(n, m=None, l=None): '''This predicate function ignores its node argument.''' # look up the bound node using its label if l is None or node_label not in l: raise TgrepException('node_label ={0} not bound in pattern'.format( node_label)) node = l[node_label] # match the relation predicates against the node return all(pred(node, m, l) for pred in reln_preds) return pattern_segment_pred def _tgrep_node_label_use_action(_s, _l, tokens): ''' Returns the node label used to begin a tgrep_expr_labeled. See `_tgrep_segmented_pattern_action`. Called for expressions like (`tgrep_node_label_use`):: =s when they appear as the first element of a `tgrep_expr_labeled` expression (see `_tgrep_segmented_pattern_action`). It returns the node label. ''' assert len(tokens) == 1 assert tokens[0].startswith('=') return tokens[0][1:] def _tgrep_node_label_pred_use_action(_s, _l, tokens): ''' Builds a lambda function representing a predicate on a tree node which describes the use of a previously bound node label. Called for expressions like (`tgrep_node_label_use_pred`):: =s when they appear inside a tgrep_node_expr (for example, inside a relation). The predicate returns true if and only if its node argument is identical the the node looked up in the node label dictionary using the node's label. ''' assert len(tokens) == 1 assert tokens[0].startswith('=') node_label = tokens[0][1:] def node_label_use_pred(n, m=None, l=None): # look up the bound node using its label if l is None or node_label not in l: raise TgrepException('node_label ={0} not bound in pattern'.format( node_label)) node = l[node_label] # truth means the given node is this node return n is node return node_label_use_pred def _tgrep_bind_node_label_action(_s, _l, tokens): ''' Builds a lambda function representing a predicate on a tree node which can optionally bind a matching node into the tgrep2 string's label_dict. Called for expressions like (`tgrep_node_expr2`):: /NP/ @NP=n ''' # tokens[0] is a tgrep_node_expr if len(tokens) == 1: return tokens[0] else: # if present, tokens[1] is the character '=', and tokens[2] is # a tgrep_node_label, a string value containing the node label assert len(tokens) == 3 assert tokens[1] == '=' node_pred = tokens[0] node_label = tokens[2] def node_label_bind_pred(n, m=None, l=None): if node_pred(n, m, l): # bind `n` into the dictionary `l` if l is None: raise TgrepException( 'cannot bind node_label {0}: label_dict is None'.format( node_label)) l[node_label] = n return True else: return False return node_label_bind_pred def _tgrep_rel_disjunction_action(_s, _l, tokens): ''' Builds a lambda function representing a predicate on a tree node from the disjunction of several other such lambda functions. 
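    This is prototypically called for expressions like (`tgrep_relations`)::

        < NP | < PP

    where the node must satisfy at least one of the disjoined relations.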
''' # filter out the pipe tokens = [x for x in tokens if x != '|'] # print 'relation disjunction tokens: ', tokens if len(tokens) == 1: return tokens[0] elif len(tokens) == 2: return (lambda a, b: lambda n, m=None, l=None: a(n, m, l) or b(n, m, l))(tokens[0], tokens[1]) def _macro_defn_action(_s, _l, tokens): ''' Builds a dictionary structure which defines the given macro. ''' assert len(tokens) == 3 assert tokens[0] == '@' return {tokens[1]: tokens[2]} def _tgrep_exprs_action(_s, _l, tokens): ''' This is the top-lebel node in a tgrep2 search string; the predicate function it returns binds together all the state of a tgrep2 search string. Builds a lambda function representing a predicate on a tree node from the disjunction of several tgrep expressions. Also handles macro definitions and macro name binding, and node label definitions and node label binding. ''' if len(tokens) == 1: return lambda n, m=None, l=None: tokens[0](n, None, {}) # filter out all the semicolons tokens = [x for x in tokens if x != ';'] # collect all macro definitions macro_dict = {} macro_defs = [tok for tok in tokens if isinstance(tok, dict)] for macro_def in macro_defs: macro_dict.update(macro_def) # collect all tgrep expressions tgrep_exprs = [tok for tok in tokens if not isinstance(tok, dict)] # create a new scope for the node label dictionary def top_level_pred(n, m=macro_dict, l=None): label_dict = {} # bind macro definitions and OR together all tgrep_exprs return any(predicate(n, m, label_dict) for predicate in tgrep_exprs) return top_level_pred def _build_tgrep_parser(set_parse_actions = True): ''' Builds a pyparsing-based parser object for tokenizing and interpreting tgrep search strings. ''' tgrep_op = (pyparsing.Optional('!') + pyparsing.Regex('[$%,.<>][%,.<>0-9-\':]*')) tgrep_qstring = pyparsing.QuotedString(quoteChar='"', escChar='\\', unquoteResults=False) tgrep_node_regex = pyparsing.QuotedString(quoteChar='/', escChar='\\', unquoteResults=False) tgrep_qstring_icase = pyparsing.Regex( 'i@\\"(?:[^"\\n\\r\\\\]|(?:\\\\.))*\\"') tgrep_node_regex_icase = pyparsing.Regex( 'i@\\/(?:[^/\\n\\r\\\\]|(?:\\\\.))*\\/') tgrep_node_literal = pyparsing.Regex('[^][ \r\t\n;:.,&|<>()$!@%\'^=]+') tgrep_expr = pyparsing.Forward() tgrep_relations = pyparsing.Forward() tgrep_parens = pyparsing.Literal('(') + tgrep_expr + ')' tgrep_nltk_tree_pos = ( pyparsing.Literal('N(') + pyparsing.Optional(pyparsing.Word(pyparsing.nums) + ',' + pyparsing.Optional(pyparsing.delimitedList( pyparsing.Word(pyparsing.nums), delim=',') + pyparsing.Optional(','))) + ')') tgrep_node_label = pyparsing.Regex('[A-Za-z0-9]+') tgrep_node_label_use = pyparsing.Combine('=' + tgrep_node_label) # see _tgrep_segmented_pattern_action tgrep_node_label_use_pred = tgrep_node_label_use.copy() macro_name = pyparsing.Regex('[^];:.,&|<>()[$!@%\'^=\r\t\n ]+') macro_name.setWhitespaceChars('') macro_use = pyparsing.Combine('@' + macro_name) tgrep_node_expr = (tgrep_node_label_use_pred | macro_use | tgrep_nltk_tree_pos | tgrep_qstring_icase | tgrep_node_regex_icase | tgrep_qstring | tgrep_node_regex | '*' | tgrep_node_literal) tgrep_node_expr2 = ((tgrep_node_expr + pyparsing.Literal('=').setWhitespaceChars('') + tgrep_node_label.copy().setWhitespaceChars('')) | tgrep_node_expr) tgrep_node = (tgrep_parens | (pyparsing.Optional("'") + tgrep_node_expr2 + pyparsing.ZeroOrMore("|" + tgrep_node_expr))) tgrep_brackets = pyparsing.Optional('!') + '[' + tgrep_relations + ']' tgrep_relation = tgrep_brackets | (tgrep_op + tgrep_node) tgrep_rel_conjunction = pyparsing.Forward() 
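    # Note: tgrep_rel_conjunction and tgrep_relations (defined just below)
    # are mutually recursive: a conjunction is a relation optionally
    # followed by further conjoined relations (the '&' is optional), and
    # tgrep_relations chains conjunctions together with '|' as a
    # disjunction.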
tgrep_rel_conjunction << (tgrep_relation + pyparsing.ZeroOrMore(pyparsing.Optional('&') + tgrep_rel_conjunction)) tgrep_relations << tgrep_rel_conjunction + pyparsing.ZeroOrMore( "|" + tgrep_relations) tgrep_expr << tgrep_node + pyparsing.Optional(tgrep_relations) tgrep_expr_labeled = tgrep_node_label_use + pyparsing.Optional(tgrep_relations) tgrep_expr2 = tgrep_expr + pyparsing.ZeroOrMore(':' + tgrep_expr_labeled) macro_defn = (pyparsing.Literal('@') + pyparsing.White().suppress() + macro_name + tgrep_expr2) tgrep_exprs = (pyparsing.Optional(macro_defn + pyparsing.ZeroOrMore(';' + macro_defn) + ';') + tgrep_expr2 + pyparsing.ZeroOrMore(';' + (macro_defn | tgrep_expr2)) + pyparsing.ZeroOrMore(';').suppress()) if set_parse_actions: tgrep_node_label_use.setParseAction(_tgrep_node_label_use_action) tgrep_node_label_use_pred.setParseAction(_tgrep_node_label_pred_use_action) macro_use.setParseAction(_tgrep_macro_use_action) tgrep_node.setParseAction(_tgrep_node_action) tgrep_node_expr2.setParseAction(_tgrep_bind_node_label_action) tgrep_parens.setParseAction(_tgrep_parens_action) tgrep_nltk_tree_pos.setParseAction(_tgrep_nltk_tree_pos_action) tgrep_relation.setParseAction(_tgrep_relation_action) tgrep_rel_conjunction.setParseAction(_tgrep_conjunction_action) tgrep_relations.setParseAction(_tgrep_rel_disjunction_action) macro_defn.setParseAction(_macro_defn_action) # the whole expression is also the conjunction of two # predicates: the first node predicate, and the remaining # relation predicates tgrep_expr.setParseAction(_tgrep_conjunction_action) tgrep_expr_labeled.setParseAction(_tgrep_segmented_pattern_action) tgrep_expr2.setParseAction(functools.partial(_tgrep_conjunction_action, join_char = ':')) tgrep_exprs.setParseAction(_tgrep_exprs_action) return tgrep_exprs.ignore('#' + pyparsing.restOfLine) def tgrep_tokenize(tgrep_string): ''' Tokenizes a TGrep search string into separate tokens. ''' parser = _build_tgrep_parser(False) if isinstance(tgrep_string, binary_type): tgrep_string = tgrep_string.decode() return list(parser.parseString(tgrep_string)) def tgrep_compile(tgrep_string): ''' Parses (and tokenizes, if necessary) a TGrep search string into a lambda function. ''' parser = _build_tgrep_parser(True) if isinstance(tgrep_string, binary_type): tgrep_string = tgrep_string.decode() return list(parser.parseString(tgrep_string, parseAll=True))[0] def treepositions_no_leaves(tree): ''' Returns all the tree positions in the given tree which are not leaf nodes. ''' treepositions = tree.treepositions() # leaves are treeposition tuples that are not prefixes of any # other treeposition prefixes = set() for pos in treepositions: for length in range(len(pos)): prefixes.add(pos[:length]) return [pos for pos in treepositions if pos in prefixes] def tgrep_positions(pattern, trees, search_leaves=True): """ Return the tree positions in the trees which match the given pattern. 
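    For example (an illustrative doctest):

    >>> from nltk.tree import ParentedTree
    >>> tree = ParentedTree.fromstring('(S (NP (DT the) (NN dog)) (VP barks))')
    >>> list(tgrep_positions('NN', [tree]))
    [[(0, 1)]]
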
:param pattern: a tgrep search pattern :type pattern: str or output of tgrep_compile() :param trees: a sequence of NLTK trees (usually ParentedTrees) :type trees: iter(ParentedTree) or iter(Tree) :param search_leaves: whether ot return matching leaf nodes :type search_leaves: bool :rtype: iter(tree positions) """ if isinstance(pattern, (binary_type, text_type)): pattern = tgrep_compile(pattern) for tree in trees: try: if search_leaves: positions = tree.treepositions() else: positions = treepositions_no_leaves(tree) yield [position for position in positions if pattern(tree[position])] except AttributeError: yield [] def tgrep_nodes(pattern, trees, search_leaves=True): """ Return the tree nodes in the trees which match the given pattern. :param pattern: a tgrep search pattern :type pattern: str or output of tgrep_compile() :param trees: a sequence of NLTK trees (usually ParentedTrees) :type trees: iter(ParentedTree) or iter(Tree) :param search_leaves: whether ot return matching leaf nodes :type search_leaves: bool :rtype: iter(tree nodes) """ if isinstance(pattern, (binary_type, text_type)): pattern = tgrep_compile(pattern) for tree in trees: try: if search_leaves: positions = tree.treepositions() else: positions = treepositions_no_leaves(tree) yield [tree[position] for position in positions if pattern(tree[position])] except AttributeError: yield [] nltk-3.1/nltk/tokenize/0000755000076500000240000000000012610001541014642 5ustar sbstaff00000000000000nltk-3.1/nltk/tokenize/__init__.py0000644000076500000240000001053212607224144016770 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Tokenizers # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird (minor additions) # URL: # For license information, see LICENSE.TXT r""" NLTK Tokenizer Package Tokenizers divide strings into lists of substrings. For example, tokenizers can be used to find the words and punctuation in a string: >>> from nltk.tokenize import word_tokenize >>> s = '''Good muffins cost $3.88\nin New York. Please buy me ... two of them.\n\nThanks.''' >>> word_tokenize(s) ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] This particular tokenizer requires the Punkt sentence tokenization models to be installed. NLTK also provides a simpler, regular-expression based tokenizer, which splits text on whitespace and punctuation: >>> from nltk.tokenize import wordpunct_tokenize >>> wordpunct_tokenize(s) ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] We can also operate at the level of sentences, using the sentence tokenizer directly as follows: >>> from nltk.tokenize import sent_tokenize, word_tokenize >>> sent_tokenize(s) ['Good muffins cost $3.88\nin New York.', 'Please buy me\ntwo of them.', 'Thanks.'] >>> [word_tokenize(t) for t in sent_tokenize(s)] [['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.'], ['Please', 'buy', 'me', 'two', 'of', 'them', '.'], ['Thanks', '.']] Caution: when tokenizing a Unicode string, make sure you are not using an encoded version of the string (it may be necessary to decode it first, e.g. with ``s.decode("utf8")``. NLTK tokenizers can produce token-spans, represented as tuples of integers having the same semantics as string slices, to support efficient comparison of tokenizers. (These methods are implemented as generators.) 
>>> from nltk.tokenize import WhitespaceTokenizer >>> list(WhitespaceTokenizer().span_tokenize(s)) [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)] There are numerous ways to tokenize text. If you need more control over tokenization, see the other methods provided in this package. For further information, please see Chapter 3 of the NLTK book. """ from nltk.data import load from nltk.tokenize.simple import (SpaceTokenizer, TabTokenizer, LineTokenizer, line_tokenize) from nltk.tokenize.regexp import (RegexpTokenizer, WhitespaceTokenizer, BlanklineTokenizer, WordPunctTokenizer, wordpunct_tokenize, regexp_tokenize, blankline_tokenize) from nltk.tokenize.punkt import PunktSentenceTokenizer from nltk.tokenize.sexpr import SExprTokenizer, sexpr_tokenize from nltk.tokenize.treebank import TreebankWordTokenizer from nltk.tokenize.stanford import StanfordTokenizer from nltk.tokenize.texttiling import TextTilingTokenizer from nltk.tokenize.casual import (TweetTokenizer, casual_tokenize) from nltk.tokenize.mwe import MWETokenizer # Standard sentence tokenizer. def sent_tokenize(text, language='english'): """ Return a sentence-tokenized copy of *text*, using NLTK's recommended sentence tokenizer (currently :class:`.PunktSentenceTokenizer` for the specified language). :param text: text to split into sentences :param language: the model name in the Punkt corpus """ tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language)) return tokenizer.tokenize(text) # Standard word tokenizer. _treebank_word_tokenize = TreebankWordTokenizer().tokenize def word_tokenize(text, language='english'): """ Return a tokenized copy of *text*, using NLTK's recommended word tokenizer (currently :class:`.TreebankWordTokenizer` along with :class:`.PunktSentenceTokenizer` for the specified language). :param text: text to split into sentences :param language: the model name in the Punkt corpus """ return [token for sent in sent_tokenize(text, language) for token in _treebank_word_tokenize(sent)] nltk-3.1/nltk/tokenize/api.py0000644000076500000240000000374712607224144016014 0ustar sbstaff00000000000000# Natural Language Toolkit: Tokenizer Interface # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT """ Tokenizer Interface """ from nltk.internals import overridden from nltk.tokenize.util import string_span_tokenize class TokenizerI(object): """ A processing interface for tokenizing a string. Subclasses must define ``tokenize()`` or ``tokenize_sents()`` (or both). """ def tokenize(self, s): """ Return a tokenized copy of *s*. :rtype: list of str """ if overridden(self.tokenize_sents): return self.tokenize_sents([s])[0] else: raise NotImplementedError() def span_tokenize(self, s): """ Identify the tokens using integer offsets ``(start_i, end_i)``, where ``s[start_i:end_i]`` is the corresponding token. :rtype: iter(tuple(int, int)) """ raise NotImplementedError() def tokenize_sents(self, strings): """ Apply ``self.tokenize()`` to each element of ``strings``. I.e.: return [self.tokenize(s) for s in strings] :rtype: list(list(str)) """ return [self.tokenize(s) for s in strings] def span_tokenize_sents(self, strings): """ Apply ``self.span_tokenize()`` to each element of ``strings``. 
I.e.: return [self.span_tokenize(s) for s in strings] :rtype: iter(list(tuple(int, int))) """ for s in strings: yield list(self.span_tokenize(s)) class StringTokenizer(TokenizerI): """A tokenizer that divides a string into substrings by splitting on the specified string (defined in subclasses). """ def tokenize(self, s): return s.split(self._string) def span_tokenize(self, s): for span in string_span_tokenize(s, self._string): yield span nltk-3.1/nltk/tokenize/casual.py0000644000076500000240000003014312607224144016501 0ustar sbstaff00000000000000# coding: utf-8 # # Natural Language Toolkit: Twitter Tokenizer # # Copyright (C) 2001-2015 NLTK Project # Author: Christopher Potts # Ewan Klein (modifications) # Pierpaolo Pantone <> (modifications) # URL: # For license information, see LICENSE.TXT # """ Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domains and tasks. The basic logic is this: 1. The tuple regex_strings defines a list of regular expression strings. 2. The regex_strings strings are put, in order, into a compiled regular expression object called word_re. 3. The tokenization is done by word_re.findall(s), where s is the user-supplied string, inside the tokenize() method of the class Tokenizer. 4. When instantiating Tokenizer objects, there is a single option: preserve_case. By default, it is set to True. If it is set to False, then the tokenizer will downcase everything except for emoticons. """ ###################################################################### from __future__ import unicode_literals import re from nltk.compat import htmlentitydefs, int2byte, unichr ###################################################################### # The following strings are components in the regular expression # that is used for tokenizing. It's important that phone_number # appears first in the final regex (since it can contain whitespace). # It also could matter that tags comes after emoticons, due to the # possibility of having text like # # <:| and some text >:) # # Most importantly, the final element should always be last, since it # does a last ditch whitespace-based tokenization of whatever is left. # ToDo: Update with http://en.wikipedia.org/wiki/List_of_emoticons ? # This particular element is used in a couple ways, so we define it # with a name: EMOTICONS = r""" (?: [<>]? [:;=8] # eyes [\-o\*\']? # optional nose [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth | [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth [\-o\*\']? # optional nose [:;=8] # eyes [<>]? | <3 # heart )""" # URL pattern due to John Gruber, modified by Tom Winzig. See # https://gist.github.com/winzig/8894715 URLS = r""" # Capture 1: entire matched URL (?: https?: # URL protocol and colon (?: /{1,3} # 1-3 slashes | # or [a-z0-9%] # Single letter or digit or '%' # (Trying not to match e.g. "URI::Escape") ) | # or # looks like domain name followed by a slash: [a-z0-9.\-]+[.] (?:[a-z]{2,13}) / ) (?: # One or more: [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[] | # or \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...) | \([^\s]+?\) # balanced parens, non-recursive: (...) )+ (?: # End with: \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...) | \([^\s]+?\) # balanced parens, non-recursive: (...) 
| # or [^\s`!()\[\]{};:'".,<>?«»“â€â€˜â€™] # not a space or one of these punct chars ) | # OR, the following to match naked domains: (?: (?\s]+>""" , # ASCII Arrows r"""[\-]+>|<[\-]+""" , # Twitter username: r"""(?:@[\w_]+)""" , # Twitter hashtags: r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""" , # Remaining word types: r""" (?:[a-z][a-z'\-_]+[a-z]) # Words with apostrophes or dashes. | (?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals. | (?:[\w_]+) # Words without apostrophes or dashes. | (?:\.(?:\s*\.){1,}) # Ellipsis dots. | (?:\S) # Everything else that isn't whitespace. """ ) ###################################################################### # This is the core tokenizing regex: WORD_RE = re.compile(r"""(%s)""" % "|".join(REGEXPS), re.VERBOSE | re.I | re.UNICODE) # The emoticon string gets its own regex so that we can preserve case for # them as needed: EMOTICON_RE = re.compile(EMOTICONS, re.VERBOSE | re.I | re.UNICODE) # These are for regularizing HTML entities to Unicode: ENT_RE = re.compile(r'&(#?(x?))([^&;\s]+);') ###################################################################### # Functions for converting html entities ###################################################################### def _str_to_unicode(text, encoding=None, errors='strict'): if encoding is None: encoding = 'utf-8' if isinstance(text, bytes): return text.decode(encoding, errors) return text def _replace_html_entities(text, keep=(), remove_illegal=True, encoding='utf-8'): """ Remove entities from text by converting them to their corresponding unicode character. :param text: a unicode string or a byte string encoded in the given `encoding` (which defaults to 'utf-8'). :param list keep: list of entity names which should not be replaced.\ This supports both numeric entities (``&#nnnn;`` and ``&#hhhh;``) and named entities (such as `` `` or ``>``). :param bool remove_illegal: If `True`, entities that can't be converted are\ removed. Otherwise, entities that can't be converted are kept "as is". :returns: A unicode string with the entities removed. See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py >>> from nltk.tokenize.casual import _replace_html_entities >>> _replace_html_entities(b'Price: £100') 'Price: \\xa3100' >>> print(_replace_html_entities(b'Price: £100')) Price: £100 >>> """ def _convert_entity(match): entity_body = match.group(3) if match.group(1): try: if match.group(2): number = int(entity_body, 16) else: number = int(entity_body, 10) # Numeric character references in the 80-9F range are typically # interpreted by browsers as representing the characters mapped # to bytes 80-9F in the Windows-1252 encoding. For more info # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML if 0x80 <= number <= 0x9f: return int2byte(number).decode('cp1252') except ValueError: number = None else: if entity_body in keep: return match.group(0) else: number = htmlentitydefs.name2codepoint.get(entity_body) if number is not None: try: return unichr(number) except ValueError: pass return "" if remove_illegal else match.group(0) return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding)) ###################################################################### class TweetTokenizer: r""" Tokenizer for tweets. 
>>> from nltk.tokenize import TweetTokenizer >>> tknzr = TweetTokenizer() >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--" >>> tknzr.tokenize(s0) ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'] >>> s1 = "@Joyster2012 @CathStaincliffe Good for you, girl!! Best wishes :-)" >>> tknzr.tokenize(s1) ['@Joyster2012', '@CathStaincliffe', 'Good', 'for', 'you', ',', 'girl', '!', '!', 'Best', 'wishes', ':-)'] >>> s2 = "3Points for #DreamTeam Gooo BAILEY! :) #PBB737Gold @PBBabscbn" >>> tknzr.tokenize(s2) ['3Points', 'for', '#DreamTeam', 'Gooo', 'BAILEY', '!', ':)', '#PBB737Gold', '@PBBabscbn'] >>> s3 = "@Insanomania They do... Their mentality doesn't :(" >>> tknzr.tokenize(s3) ['@Insanomania', 'They', 'do', '...', 'Their', 'mentality', "doesn't", ':('] >>> s4 = "RT @facugambande: Ya por arrancar a grabar !!! #TirenTirenTiren vamoo !!" >>> tknzr.tokenize(s4) ['RT', '@facugambande', ':', 'Ya', 'por', 'arrancar', 'a', 'grabar', '!', '!', '!', '#TirenTirenTiren', 'vamoo', '!', '!'] >>> tknzr = TweetTokenizer(reduce_len=True) >>> s5 = "@crushinghes the summer holidays are great but I'm so bored already :(" >>> tknzr.tokenize(s5) ['@crushinghes', 'the', 'summer', 'holidays', 'are', 'great', 'but', "I'm", 'so', 'bored', 'already', ':('] Examples using `strip_handles` and `reduce_len parameters`: >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True) >>> s6 = '@remy: This is waaaaayyyy too much for you!!!!!!' >>> tknzr.tokenize(s6) [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!'] >>> s7 = '@_willy65: No place for @chuck tonight. Sorry.' >>> tknzr.tokenize(s7) [':', 'No', 'place', 'for', 'tonight', '.', 'Sorry', '.'] >>> s8 = '@mar_tin is a great developer. Contact him at mar_tin@email.com' >>> tknzr.tokenize(s8) ['is', 'a', 'great', 'developer', '.', 'Contact', 'him', 'at', 'mar_tin', '@email', '.', 'com'] """ def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False): self.preserve_case = preserve_case self.reduce_len = reduce_len self.strip_handles = strip_handles def tokenize(self, text): """ :param text: str :rtype: list(str) :return: a tokenized list of strings; concatenating this list returns\ the original string if `preserve_case=False` """ # Fix HTML character entities: text = _replace_html_entities(text) # Remove username handles if self.strip_handles: text = remove_handles(text) # Normalize word lengthening if self.reduce_len: text = reduce_lengthening(text) # Tokenize: words = WORD_RE.findall(text) # Possibly alter the case, but avoid changing emoticons like :D into :d: if not self.preserve_case: words = list(map((lambda x : x if EMOTICON_RE.search(x) else x.lower()), words)) return words ###################################################################### # Normalization Functions ###################################################################### def reduce_lengthening(text): """ Replace repeated character sequences of length 3 or greater with sequences of length 3. """ pattern = re.compile(r"(.)\1{2,}") return pattern.sub(r"\1\1\1", text) def remove_handles(text): """ Remove Twitter username handles from text. 
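    For example (an illustrative doctest):

        >>> remove_handles('@remy: This is waaaaayyyy too much for you!!!!!!')
        ': This is waaaaayyyy too much for you!!!!!!'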
""" pattern = re.compile(r"(^|(?<=[^\w.-]))@[A-Za-z_]+\w+") return pattern.sub('', text) ###################################################################### # Tokenization Function ###################################################################### def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=False): """ Convenience function for wrapping the tokenizer. """ return TweetTokenizer(preserve_case=preserve_case, reduce_len=reduce_len, strip_handles=strip_handles).tokenize(text) ############################################################################### nltk-3.1/nltk/tokenize/mwe.py0000644000076500000240000000651012607224144016022 0ustar sbstaff00000000000000# Multi-Word Expression tokenizer # # Copyright (C) 2001-2015 NLTK Project # Author: Rob Malouf # URL: # For license information, see LICENSE.TXT """ Multi-Word Expression Tokenizer A ``MWETokenizer`` takes a string which has already been divided into tokens and retokenizes it, merging multi-word expressions into single tokens, using a lexicon of MWEs: >>> from nltk.tokenize import MWETokenizer >>> tokenizer = MWETokenizer([('a', 'little'), ('a', 'little', 'bit'), ('a', 'lot')]) >>> tokenizer.add_mwe(('in', 'spite', 'of')) >>> tokenizer.tokenize('Testing testing testing one two three'.split()) ['Testing', 'testing', 'testing', 'one', 'two', 'three'] >>> tokenizer.tokenize('This is a test in spite'.split()) ['This', 'is', 'a', 'test', 'in', 'spite'] >>> tokenizer.tokenize('In a little or a little bit or a lot in spite of'.split()) ['In', 'a_little', 'or', 'a_little_bit', 'or', 'a_lot', 'in_spite_of'] """ from nltk.tokenize.api import TokenizerI class MWETokenizer(TokenizerI): """ A tokenizer that processes tokenized text and merges multi-word expressions into single tokens: >>> tokenizer = MWETokenizer([('hors', "d'oeuvre")], separator='+') >>> tokenizer.tokenize("An hors d'oeuvre tonight, sir?".split()) ['An', "hors+d'oeuvre", 'tonight,', 'sir?'] :type mwes: list(list(str)) :param mwes: A sequence of multi-word expressions to be merged, where each MWE is a sequence of strings. :type separator: str :param separator: String that should be inserted between words in a multi-word expression token. """ def __init__(self, mwes=None, separator='_'): if not mwes: mwes = [] self._mwes = dict() self._separator = separator for mwe in mwes: self.add_mwe(mwe) def add_mwe(self, mwe, _trie=None): """ Add a multi-word expression to the lexicon (stored as a word trie) We represent the trie as a dict of dicts: >>> tokenizer = MWETokenizer([('a', 'b'), ('a', 'b', 'c'), ('a', 'x')]) >>> tokenizer._mwes {'a': {'x': {True: None}, 'b': {True: None, 'c': {True: None}}}} The key True marks the end of a valid MWE """ if _trie is None: _trie = self._mwes if mwe: if mwe[0] not in _trie: _trie[mwe[0]] = dict() self.add_mwe(mwe[1:], _trie=_trie[mwe[0]]) else: _trie[True] = None def tokenize(self, text): i = 0 n = len(text) result = [] while i < n: if text[i] in self._mwes: # possible MWE match j = i trie = self._mwes while j < n and text[j] in trie: trie = trie[text[j]] j = j + 1 else: if True in trie: # success! 
result.append(self._separator.join(text[i:j])) i = j else: # no match, so backtrack result.append(text[i]) i += 1 else: result.append(text[i]) i += 1 return result nltk-3.1/nltk/tokenize/punkt.py0000644000076500000240000017010512607224144016375 0ustar sbstaff00000000000000# Natural Language Toolkit: Punkt sentence tokenizer # # Copyright (C) 2001-2015 NLTK Project # Algorithm: Kiss & Strunk (2006) # Author: Willy (original Python port) # Steven Bird (additions) # Edward Loper (rewrite) # Joel Nothman (almost rewrite) # Arthur Darcet (fixes) # URL: # For license information, see LICENSE.TXT r""" Punkt Sentence Tokenizer This tokenizer divides a text into a list of sentences, by using an unsupervised algorithm to build a model for abbreviation words, collocations, and words that start sentences. It must be trained on a large collection of plaintext in the target language before it can be used. The NLTK data package includes a pre-trained Punkt tokenizer for English. >>> import nltk.data >>> text = ''' ... Punkt knows that the periods in Mr. Smith and Johann S. Bach ... do not mark sentence boundaries. And sometimes sentences ... can start with non-capitalized words. i is a good variable ... name. ... ''' >>> sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') >>> print('\n-----\n'.join(sent_detector.tokenize(text.strip()))) Punkt knows that the periods in Mr. Smith and Johann S. Bach do not mark sentence boundaries. ----- And sometimes sentences can start with non-capitalized words. ----- i is a good variable name. (Note that whitespace from the original text, including newlines, is retained in the output.) Punctuation following sentences is also included by default (from NLTK 3.0 onwards). It can be excluded with the realign_boundaries flag. >>> text = ''' ... (How does it deal with this parenthesis?) "It should be part of the ... previous sentence." "(And the same with this one.)" ('And this one!') ... "('(And (this)) '?)" [(and this. )] ... ''' >>> print('\n-----\n'.join( ... sent_detector.tokenize(text.strip()))) (How does it deal with this parenthesis?) ----- "It should be part of the previous sentence." ----- "(And the same with this one.)" ----- ('And this one!') ----- "('(And (this)) '?)" ----- [(and this. )] >>> print('\n-----\n'.join( ... sent_detector.tokenize(text.strip(), realign_boundaries=False))) (How does it deal with this parenthesis? ----- ) "It should be part of the previous sentence. ----- " "(And the same with this one. ----- )" ('And this one! ----- ') "('(And (this)) '? ----- )" [(and this. ----- )] However, Punkt is designed to learn parameters (a list of abbreviations, etc.) unsupervised from a corpus similar to the target domain. The pre-packaged models may therefore be unsuitable: use ``PunktSentenceTokenizer(text)`` to learn parameters from the given text. :class:`.PunktTrainer` learns parameters such as a list of abbreviations (without supervision) from portions of text. Using a ``PunktTrainer`` directly allows for incremental training and modification of the hyper-parameters used to decide what is considered an abbreviation, etc. The algorithm for this tokenizer is described in:: Kiss, Tibor and Strunk, Jan (2006): Unsupervised Multilingual Sentence Boundary Detection. Computational Linguistics 32: 485-525. 
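A minimal sketch of incremental training on your own corpus (illustrative
only; ``my_text`` stands for a plain-text training string supplied by you)::

    >>> from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktTrainer
    >>> trainer = PunktTrainer()                                   # doctest: +SKIP
    >>> trainer.train(my_text, finalize=False)                     # doctest: +SKIP
    >>> trainer.finalize_training()                                # doctest: +SKIP
    >>> tokenizer = PunktSentenceTokenizer(trainer.get_params())   # doctest: +SKIP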
""" from __future__ import print_function, unicode_literals # TODO: Make orthographic heuristic less susceptible to overtraining # TODO: Frequent sentence starters optionally exclude always-capitalised words # FIXME: Problem with ending string with e.g. '!!!' -> '!! !' import re import math from collections import defaultdict from nltk.compat import unicode_repr, python_2_unicode_compatible, string_types from nltk.probability import FreqDist from nltk.tokenize.api import TokenizerI ###################################################################### #{ Orthographic Context Constants ###################################################################### # The following constants are used to describe the orthographic # contexts in which a word can occur. BEG=beginning, MID=middle, # UNK=unknown, UC=uppercase, LC=lowercase, NC=no case. _ORTHO_BEG_UC = 1 << 1 """Orthographic context: beginning of a sentence with upper case.""" _ORTHO_MID_UC = 1 << 2 """Orthographic context: middle of a sentence with upper case.""" _ORTHO_UNK_UC = 1 << 3 """Orthographic context: unknown position in a sentence with upper case.""" _ORTHO_BEG_LC = 1 << 4 """Orthographic context: beginning of a sentence with lower case.""" _ORTHO_MID_LC = 1 << 5 """Orthographic context: middle of a sentence with lower case.""" _ORTHO_UNK_LC = 1 << 6 """Orthographic context: unknown position in a sentence with lower case.""" _ORTHO_UC = _ORTHO_BEG_UC + _ORTHO_MID_UC + _ORTHO_UNK_UC """Orthographic context: occurs with upper case.""" _ORTHO_LC = _ORTHO_BEG_LC + _ORTHO_MID_LC + _ORTHO_UNK_LC """Orthographic context: occurs with lower case.""" _ORTHO_MAP = { ('initial', 'upper'): _ORTHO_BEG_UC, ('internal', 'upper'): _ORTHO_MID_UC, ('unknown', 'upper'): _ORTHO_UNK_UC, ('initial', 'lower'): _ORTHO_BEG_LC, ('internal', 'lower'): _ORTHO_MID_LC, ('unknown', 'lower'): _ORTHO_UNK_LC, } """A map from context position and first-letter case to the appropriate orthographic context flag.""" #} (end orthographic context constants) ###################################################################### ###################################################################### #{ Decision reasons for debugging ###################################################################### REASON_DEFAULT_DECISION = 'default decision' REASON_KNOWN_COLLOCATION = 'known collocation (both words)' REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC = 'abbreviation + orthographic heuristic' REASON_ABBR_WITH_SENTENCE_STARTER = 'abbreviation + frequent sentence starter' REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC = 'initial + orthographic heuristic' REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC = 'initial + orthographic heuristic' REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC = 'initial + special orthographic heuristic' #} (end decision reasons for debugging) ###################################################################### ###################################################################### #{ Language-dependent variables ###################################################################### class PunktLanguageVars(object): """ Stores variables, mostly regular expressions, which may be language-dependent for correct application of the algorithm. An extension of this class may modify its properties to suit a language other than English; an instance can then be passed as an argument to PunktSentenceTokenizer and PunktTrainer constructors. 
""" __slots__ = ('_re_period_context', '_re_word_tokenizer') def __getstate__(self): # All modifications to the class are performed by inheritance. # Non-default parameters to be pickled must be defined in the inherited # class. return 1 def __setstate__(self, state): return 1 sent_end_chars = ('.', '?', '!') """Characters which are candidates for sentence boundaries""" @property def _re_sent_end_chars(self): return '[%s]' % re.escape(''.join(self.sent_end_chars)) internal_punctuation = ',:;' # might want to extend this.. """sentence internal punctuation, which indicates an abbreviation if preceded by a period-final token.""" re_boundary_realignment = re.compile(r'["\')\]}]+?(?:\s+|(?=--)|$)', re.MULTILINE) """Used to realign punctuation that should be included in a sentence although it follows the period (or ?, !).""" _re_word_start = r"[^\(\"\`{\[:;&\#\*@\)}\]\-,]" """Excludes some characters from starting word tokens""" _re_non_word_chars = r"(?:[?!)\";}\]\*:@\'\({\[])" """Characters that cannot appear within words""" _re_multi_char_punct = r"(?:\-{2,}|\.{2,}|(?:\.\s){2,}\.)" """Hyphen and ellipsis are multi-character punctuation""" _word_tokenize_fmt = r'''( %(MultiChar)s | (?=%(WordStart)s)\S+? # Accept word characters until end is found (?= # Sequences marking a word's end \s| # White-space $| # End-of-string %(NonWord)s|%(MultiChar)s| # Punctuation ,(?=$|\s|%(NonWord)s|%(MultiChar)s) # Comma if at end of word ) | \S )''' """Format of a regular expression to split punctuation from words, excluding period.""" def _word_tokenizer_re(self): """Compiles and returns a regular expression for word tokenization""" try: return self._re_word_tokenizer except AttributeError: self._re_word_tokenizer = re.compile( self._word_tokenize_fmt % { 'NonWord': self._re_non_word_chars, 'MultiChar': self._re_multi_char_punct, 'WordStart': self._re_word_start, }, re.UNICODE | re.VERBOSE ) return self._re_word_tokenizer def word_tokenize(self, s): """Tokenize a string to split off punctuation other than periods""" return self._word_tokenizer_re().findall(s) _period_context_fmt = r""" \S* # some word material %(SentEndChars)s # a potential sentence ending (?=(?P %(NonWord)s # either other punctuation | \s+(?P\S+) # or whitespace and some other token ))""" """Format of a regular expression to find contexts including possible sentence boundaries. Matches token which the possible sentence boundary ends, and matches the following token within a lookahead expression.""" def period_context_re(self): """Compiles and returns a regular expression to find contexts including possible sentence boundaries.""" try: return self._re_period_context except: self._re_period_context = re.compile( self._period_context_fmt % { 'NonWord': self._re_non_word_chars, 'SentEndChars': self._re_sent_end_chars, }, re.UNICODE | re.VERBOSE) return self._re_period_context _re_non_punct = re.compile(r'[^\W\d]', re.UNICODE) """Matches token types that are not merely punctuation. (Types for numeric tokens are changed to ##number## and hence contain alpha.)""" #} ###################################################################### #//////////////////////////////////////////////////////////// #{ Helper Functions #//////////////////////////////////////////////////////////// def _pair_iter(it): """ Yields pairs of tokens from the given iterator such that each input token will appear as the first element in a yielded tuple. The last pair will have None as its second element. 
""" it = iter(it) prev = next(it) for el in it: yield (prev, el) prev = el yield (prev, None) ###################################################################### #{ Punkt Parameters ###################################################################### class PunktParameters(object): """Stores data used to perform sentence boundary detection with Punkt.""" def __init__(self): self.abbrev_types = set() """A set of word types for known abbreviations.""" self.collocations = set() """A set of word type tuples for known common collocations where the first word ends in a period. E.g., ('S.', 'Bach') is a common collocation in a text that discusses 'Johann S. Bach'. These count as negative evidence for sentence boundaries.""" self.sent_starters = set() """A set of word types for words that often appear at the beginning of sentences.""" self.ortho_context = defaultdict(int) """A dictionary mapping word types to the set of orthographic contexts that word type appears in. Contexts are represented by adding orthographic context flags: ...""" def clear_abbrevs(self): self.abbrev_types = set() def clear_collocations(self): self.collocations = set() def clear_sent_starters(self): self.sent_starters = set() def clear_ortho_context(self): self.ortho_context = defaultdict(int) def add_ortho_context(self, typ, flag): self.ortho_context[typ] |= flag def _debug_ortho_context(self, typ): c = self.ortho_context[typ] if c & _ORTHO_BEG_UC: yield 'BEG-UC' if c & _ORTHO_MID_UC: yield 'MID-UC' if c & _ORTHO_UNK_UC: yield 'UNK-UC' if c & _ORTHO_BEG_LC: yield 'BEG-LC' if c & _ORTHO_MID_LC: yield 'MID-LC' if c & _ORTHO_UNK_LC: yield 'UNK-LC' ###################################################################### #{ PunktToken ###################################################################### @python_2_unicode_compatible class PunktToken(object): """Stores a token of text with annotations produced during sentence boundary detection.""" _properties = [ 'parastart', 'linestart', 'sentbreak', 'abbr', 'ellipsis' ] __slots__ = ['tok', 'type', 'period_final'] + _properties def __init__(self, tok, **params): self.tok = tok self.type = self._get_type(tok) self.period_final = tok.endswith('.') for p in self._properties: setattr(self, p, None) for k in params: setattr(self, k, params[k]) #//////////////////////////////////////////////////////////// #{ Regular expressions for properties #//////////////////////////////////////////////////////////// # Note: [A-Za-z] is approximated by [^\W\d] in the general case. _RE_ELLIPSIS = re.compile(r'\.\.+$') _RE_NUMERIC = re.compile(r'^-?[\.,]?\d[\d,\.-]*\.?$') _RE_INITIAL = re.compile(r'[^\W\d]\.$', re.UNICODE) _RE_ALPHA = re.compile(r'[^\W\d]+$', re.UNICODE) #//////////////////////////////////////////////////////////// #{ Derived properties #//////////////////////////////////////////////////////////// def _get_type(self, tok): """Returns a case-normalized representation of the token.""" return self._RE_NUMERIC.sub('##number##', tok.lower()) @property def type_no_period(self): """ The type with its final period removed if it has one. """ if len(self.type) > 1 and self.type[-1] == '.': return self.type[:-1] return self.type @property def type_no_sentperiod(self): """ The type with its final period removed if it is marked as a sentence break. 
""" if self.sentbreak: return self.type_no_period return self.type @property def first_upper(self): """True if the token's first character is uppercase.""" return self.tok[0].isupper() @property def first_lower(self): """True if the token's first character is lowercase.""" return self.tok[0].islower() @property def first_case(self): if self.first_lower: return 'lower' elif self.first_upper: return 'upper' return 'none' @property def is_ellipsis(self): """True if the token text is that of an ellipsis.""" return self._RE_ELLIPSIS.match(self.tok) @property def is_number(self): """True if the token text is that of a number.""" return self.type.startswith('##number##') @property def is_initial(self): """True if the token text is that of an initial.""" return self._RE_INITIAL.match(self.tok) @property def is_alpha(self): """True if the token text is all alphabetic.""" return self._RE_ALPHA.match(self.tok) @property def is_non_punct(self): """True if the token is either a number or is alphabetic.""" return _re_non_punct.search(self.type) #//////////////////////////////////////////////////////////// #{ String representation #//////////////////////////////////////////////////////////// def __repr__(self): """ A string representation of the token that can reproduce it with eval(), which lists all the token's non-default annotations. """ typestr = (' type=%s,' % unicode_repr(self.type) if self.type != self.tok else '') propvals = ', '.join( '%s=%s' % (p, unicode_repr(getattr(self, p))) for p in self._properties if getattr(self, p) ) return '%s(%s,%s %s)' % (self.__class__.__name__, unicode_repr(self.tok), typestr, propvals) def __str__(self): """ A string representation akin to that used by Kiss and Strunk. """ res = self.tok if self.abbr: res += '' if self.ellipsis: res += '' if self.sentbreak: res += '' return res ###################################################################### #{ Punkt base class ###################################################################### class PunktBaseClass(object): """ Includes common components of PunktTrainer and PunktSentenceTokenizer. """ def __init__(self, lang_vars=PunktLanguageVars(), token_cls=PunktToken, params=PunktParameters()): self._params = params self._lang_vars = lang_vars self._Token = token_cls """The collection of parameters that determines the behavior of the punkt tokenizer.""" #//////////////////////////////////////////////////////////// #{ Word tokenization #//////////////////////////////////////////////////////////// def _tokenize_words(self, plaintext): """ Divide the given text into tokens, using the punkt word segmentation regular expression, and generate the resulting list of tokens augmented as three-tuples with two boolean values for whether the given token occurs at the start of a paragraph or a new line, respectively. """ parastart = False for line in plaintext.split('\n'): if line.strip(): line_toks = iter(self._lang_vars.word_tokenize(line)) yield self._Token(next(line_toks), parastart=parastart, linestart=True) parastart = False for t in line_toks: yield self._Token(t) else: parastart = True #//////////////////////////////////////////////////////////// #{ Annotation Procedures #//////////////////////////////////////////////////////////// def _annotate_first_pass(self, tokens): """ Perform the first pass of annotation, which makes decisions based purely based on the word type of each word: - '?', '!', and '.' are marked as sentence breaks. - sequences of two or more periods are marked as ellipsis. - any word ending in '.' 
that's a known abbreviation is marked as an abbreviation. - any other word ending in '.' is marked as a sentence break. Each token is then yielded with these annotations attached, via its ``sentbreak``, ``abbr`` and ``ellipsis`` attributes. """ for aug_tok in tokens: self._first_pass_annotation(aug_tok) yield aug_tok def _first_pass_annotation(self, aug_tok): """ Performs type-based annotation on a single token. """ tok = aug_tok.tok if tok in self._lang_vars.sent_end_chars: aug_tok.sentbreak = True elif aug_tok.is_ellipsis: aug_tok.ellipsis = True elif aug_tok.period_final and not tok.endswith('..'): if (tok[:-1].lower() in self._params.abbrev_types or tok[:-1].lower().split('-')[-1] in self._params.abbrev_types): aug_tok.abbr = True else: aug_tok.sentbreak = True return ###################################################################### #{ Punkt Trainer ###################################################################### class PunktTrainer(PunktBaseClass): """Learns parameters used in Punkt sentence boundary detection.""" def __init__(self, train_text=None, verbose=False, lang_vars=PunktLanguageVars(), token_cls=PunktToken): PunktBaseClass.__init__(self, lang_vars=lang_vars, token_cls=token_cls) self._type_fdist = FreqDist() """A frequency distribution giving the frequency of each case-normalized token type in the training data.""" self._num_period_toks = 0 """The number of words ending in period in the training data.""" self._collocation_fdist = FreqDist() """A frequency distribution giving the frequency of all bigrams in the training data where the first word ends in a period. Bigrams are encoded as tuples of word types. Especially common collocations are extracted from this frequency distribution, and stored in ``_params.collocations``.""" self._sent_starter_fdist = FreqDist() """A frequency distribution giving the frequency of all words that occur in the training data at the beginning of a sentence (after the first pass of annotation). Especially common sentence starters are extracted from this frequency distribution, and stored in ``_params.sent_starters``.
""" self._sentbreak_count = 0 """The total number of sentence breaks identified in training, used for calculating the frequent sentence starter heuristic.""" self._finalized = True """A flag as to whether the training has been finalized by finding collocations and sentence starters, or whether finalize_training() still needs to be called.""" if train_text: self.train(train_text, verbose, finalize=True) def get_params(self): """ Calculates and returns parameters for sentence boundary detection as derived from training.""" if not self._finalized: self.finalize_training() return self._params #//////////////////////////////////////////////////////////// #{ Customization Variables #//////////////////////////////////////////////////////////// ABBREV = 0.3 """cut-off value whether a 'token' is an abbreviation""" IGNORE_ABBREV_PENALTY = False """allows the disabling of the abbreviation penalty heuristic, which exponentially disadvantages words that are found at times without a final period.""" ABBREV_BACKOFF = 5 """upper cut-off for Mikheev's(2002) abbreviation detection algorithm""" COLLOCATION = 7.88 """minimal log-likelihood value that two tokens need to be considered as a collocation""" SENT_STARTER = 30 """minimal log-likelihood value that a token requires to be considered as a frequent sentence starter""" INCLUDE_ALL_COLLOCS = False """this includes as potential collocations all word pairs where the first word ends in a period. It may be useful in corpora where there is a lot of variation that makes abbreviations like Mr difficult to identify.""" INCLUDE_ABBREV_COLLOCS = False """this includes as potential collocations all word pairs where the first word is an abbreviation. Such collocations override the orthographic heuristic, but not the sentence starter heuristic. This is overridden by INCLUDE_ALL_COLLOCS, and if both are false, only collocations with initials and ordinals are considered.""" """""" MIN_COLLOC_FREQ = 1 """this sets a minimum bound on the number of times a bigram needs to appear before it can be considered a collocation, in addition to log likelihood statistics. This is useful when INCLUDE_ALL_COLLOCS is True.""" #//////////////////////////////////////////////////////////// #{ Training.. #//////////////////////////////////////////////////////////// def train(self, text, verbose=False, finalize=True): """ Collects training data from a given text. If finalize is True, it will determine all the parameters for sentence boundary detection. If not, this will be delayed until get_params() or finalize_training() is called. If verbose is True, abbreviations found will be listed. """ # Break the text into tokens; record which token indices correspond to # line starts and paragraph starts; and determine their types. self._train_tokens(self._tokenize_words(text), verbose) if finalize: self.finalize_training(verbose) def train_tokens(self, tokens, verbose=False, finalize=True): """ Collects training data from a given list of tokens. """ self._train_tokens((self._Token(t) for t in tokens), verbose) if finalize: self.finalize_training(verbose) def _train_tokens(self, tokens, verbose): self._finalized = False # Ensure tokens are a list tokens = list(tokens) # Find the frequency of each case-normalized type. (Don't # strip off final periods.) Also keep track of the number of # tokens that end in periods. 
for aug_tok in tokens: self._type_fdist[aug_tok.type] += 1 if aug_tok.period_final: self._num_period_toks += 1 # Look for new abbreviations, and for types that no longer are unique_types = self._unique_types(tokens) for abbr, score, is_add in self._reclassify_abbrev_types(unique_types): if score >= self.ABBREV: if is_add: self._params.abbrev_types.add(abbr) if verbose: print((' Abbreviation: [%6.4f] %s' % (score, abbr))) else: if not is_add: self._params.abbrev_types.remove(abbr) if verbose: print((' Removed abbreviation: [%6.4f] %s' % (score, abbr))) # Make a preliminary pass through the document, marking likely # sentence breaks, abbreviations, and ellipsis tokens. tokens = list(self._annotate_first_pass(tokens)) # Check what contexts each word type can appear in, given the # case of its first letter. self._get_orthography_data(tokens) # We need total number of sentence breaks to find sentence starters self._sentbreak_count += self._get_sentbreak_count(tokens) # The remaining heuristics relate to pairs of tokens where the first # ends in a period. for aug_tok1, aug_tok2 in _pair_iter(tokens): if not aug_tok1.period_final or not aug_tok2: continue # Is the first token a rare abbreviation? if self._is_rare_abbrev_type(aug_tok1, aug_tok2): self._params.abbrev_types.add(aug_tok1.type_no_period) if verbose: print((' Rare Abbrev: %s' % aug_tok1.type)) # Does second token have a high likelihood of starting a sentence? if self._is_potential_sent_starter(aug_tok2, aug_tok1): self._sent_starter_fdist[aug_tok2.type] += 1 # Is this bigram a potential collocation? if self._is_potential_collocation(aug_tok1, aug_tok2): self._collocation_fdist[ (aug_tok1.type_no_period, aug_tok2.type_no_sentperiod)] += 1 def _unique_types(self, tokens): return set(aug_tok.type for aug_tok in tokens) def finalize_training(self, verbose=False): """ Uses data that has been gathered in training to determine likely collocations and sentence starters. """ self._params.clear_sent_starters() for typ, ll in self._find_sent_starters(): self._params.sent_starters.add(typ) if verbose: print((' Sent Starter: [%6.4f] %r' % (ll, typ))) self._params.clear_collocations() for (typ1, typ2), ll in self._find_collocations(): self._params.collocations.add( (typ1,typ2) ) if verbose: print((' Collocation: [%6.4f] %r+%r' % (ll, typ1, typ2))) self._finalized = True #//////////////////////////////////////////////////////////// #{ Overhead reduction #//////////////////////////////////////////////////////////// def freq_threshold(self, ortho_thresh=2, type_thresh=2, colloc_thres=2, sentstart_thresh=2): """ Allows memory use to be reduced after much training by removing data about rare tokens that are unlikely to have a statistical effect with further training. Entries occurring above the given thresholds will be retained. """ if ortho_thresh > 1: old_oc = self._params.ortho_context self._params.clear_ortho_context() for tok in self._type_fdist: count = self._type_fdist[tok] if count >= ortho_thresh: self._params.ortho_context[tok] = old_oc[tok] self._type_fdist = self._freq_threshold(self._type_fdist, type_thresh) self._collocation_fdist = self._freq_threshold( self._collocation_fdist, colloc_thres) self._sent_starter_fdist = self._freq_threshold( self._sent_starter_fdist, sentstart_thresh) def _freq_threshold(self, fdist, threshold): """ Returns a FreqDist containing only data with counts below a given threshold, as well as a mapping (None -> count_removed). 
""" # We assume that there is more data below the threshold than above it # and so create a new FreqDist rather than working in place. res = FreqDist() num_removed = 0 for tok in fdist: count = fdist[tok] if count < threshold: num_removed += 1 else: res[tok] += count res[None] += num_removed return res #//////////////////////////////////////////////////////////// #{ Orthographic data #//////////////////////////////////////////////////////////// def _get_orthography_data(self, tokens): """ Collect information about whether each token type occurs with different case patterns (i) overall, (ii) at sentence-initial positions, and (iii) at sentence-internal positions. """ # 'initial' or 'internal' or 'unknown' context = 'internal' tokens = list(tokens) for aug_tok in tokens: # If we encounter a paragraph break, then it's a good sign # that it's a sentence break. But err on the side of # caution (by not positing a sentence break) if we just # saw an abbreviation. if aug_tok.parastart and context != 'unknown': context = 'initial' # If we're at the beginning of a line, then we can't decide # between 'internal' and 'initial'. if aug_tok.linestart and context == 'internal': context = 'unknown' # Find the case-normalized type of the token. If it's a # sentence-final token, strip off the period. typ = aug_tok.type_no_sentperiod # Update the orthographic context table. flag = _ORTHO_MAP.get((context, aug_tok.first_case), 0) if flag: self._params.add_ortho_context(typ, flag) # Decide whether the next word is at a sentence boundary. if aug_tok.sentbreak: if not (aug_tok.is_number or aug_tok.is_initial): context = 'initial' else: context = 'unknown' elif aug_tok.ellipsis or aug_tok.abbr: context = 'unknown' else: context = 'internal' #//////////////////////////////////////////////////////////// #{ Abbreviations #//////////////////////////////////////////////////////////// def _reclassify_abbrev_types(self, types): """ (Re)classifies each given token if - it is period-final and not a known abbreviation; or - it is not period-final and is otherwise a known abbreviation by checking whether its previous classification still holds according to the heuristics of section 3. Yields triples (abbr, score, is_add) where abbr is the type in question, score is its log-likelihood with penalties applied, and is_add specifies whether the present type is a candidate for inclusion or exclusion as an abbreviation, such that: - (is_add and score >= 0.3) suggests a new abbreviation; and - (not is_add and score < 0.3) suggests excluding an abbreviation. """ # (While one could recalculate abbreviations from all .-final tokens at # every iteration, in cases requiring efficiency, the number of tokens # in the present training document will be much less.) for typ in types: # Check some basic conditions, to rule out words that are # clearly not abbrev_types. if not _re_non_punct.search(typ) or typ == '##number##': continue if typ.endswith('.'): if typ in self._params.abbrev_types: continue typ = typ[:-1] is_add = True else: if typ not in self._params.abbrev_types: continue is_add = False # Count how many periods & nonperiods are in the # candidate. num_periods = typ.count('.') + 1 num_nonperiods = len(typ) - num_periods + 1 # Let be the candidate without the period, and # be the period. Find a log likelihood ratio that # indicates whether occurs as a single unit (high # value of ll), or as two independent units and # (low value of ll). 
count_with_period = self._type_fdist[typ + '.'] count_without_period = self._type_fdist[typ] ll = self._dunning_log_likelihood( count_with_period + count_without_period, self._num_period_toks, count_with_period, self._type_fdist.N()) # Apply three scaling factors to 'tweak' the basic log # likelihood ratio: # F_length: long word -> less likely to be an abbrev # F_periods: more periods -> more likely to be an abbrev # F_penalty: penalize occurrences w/o a period f_length = math.exp(-num_nonperiods) f_periods = num_periods f_penalty = (int(self.IGNORE_ABBREV_PENALTY) or math.pow(num_nonperiods, -count_without_period)) score = ll * f_length * f_periods * f_penalty yield typ, score, is_add def find_abbrev_types(self): """ Recalculates abbreviations given type frequencies, despite no prior determination of abbreviations. This fails to include abbreviations otherwise found as "rare". """ self._params.clear_abbrevs() tokens = (typ for typ in self._type_fdist if typ and typ.endswith('.')) for abbr, score, is_add in self._reclassify_abbrev_types(tokens): if score >= self.ABBREV: self._params.abbrev_types.add(abbr) # This function combines the work done by the original code's # functions `count_orthography_context`, `get_orthography_count`, # and `get_rare_abbreviations`. def _is_rare_abbrev_type(self, cur_tok, next_tok): """ A word type is counted as a rare abbreviation if... - it's not already marked as an abbreviation - it occurs fewer than ABBREV_BACKOFF times - either it is followed by a sentence-internal punctuation mark, *or* it is followed by a lower-case word that sometimes appears with upper case, but never occurs with lower case at the beginning of sentences. """ if cur_tok.abbr or not cur_tok.sentbreak: return False # Find the case-normalized type of the token. If it's # a sentence-final token, strip off the period. typ = cur_tok.type_no_sentperiod # Proceed only if the type hasn't been categorized as an # abbreviation already, and is sufficiently rare... count = self._type_fdist[typ] + self._type_fdist[typ[:-1]] if (typ in self._params.abbrev_types or count >= self.ABBREV_BACKOFF): return False # Record this token as an abbreviation if the next # token is a sentence-internal punctuation mark. # [XX] :1 or check the whole thing?? if next_tok.tok[:1] in self._lang_vars.internal_punctuation: return True # Record this type as an abbreviation if the next # token... (i) starts with a lower case letter, # (ii) sometimes occurs with an uppercase letter, # and (iii) never occus with an uppercase letter # sentence-internally. # [xx] should the check for (ii) be modified?? elif next_tok.first_lower: typ2 = next_tok.type_no_sentperiod typ2ortho_context = self._params.ortho_context[typ2] if ( (typ2ortho_context & _ORTHO_BEG_UC) and not (typ2ortho_context & _ORTHO_MID_UC) ): return True #//////////////////////////////////////////////////////////// #{ Log Likelihoods #//////////////////////////////////////////////////////////// # helper for _reclassify_abbrev_types: @staticmethod def _dunning_log_likelihood(count_a, count_b, count_ab, N): """ A function that calculates the modified Dunning log-likelihood ratio scores for abbreviation candidates. The details of how this works is available in the paper. 
""" p1 = float(count_b) / N p2 = 0.99 null_hypo = (float(count_ab) * math.log(p1) + (count_a - count_ab) * math.log(1.0 - p1)) alt_hypo = (float(count_ab) * math.log(p2) + (count_a - count_ab) * math.log(1.0 - p2)) likelihood = null_hypo - alt_hypo return (-2.0 * likelihood) @staticmethod def _col_log_likelihood(count_a, count_b, count_ab, N): """ A function that will just compute log-likelihood estimate, in the original paper it's described in algorithm 6 and 7. This *should* be the original Dunning log-likelihood values, unlike the previous log_l function where it used modified Dunning log-likelihood values """ import math p = 1.0 * count_b / N p1 = 1.0 * count_ab / count_a p2 = 1.0 * (count_b - count_ab) / (N - count_a) summand1 = (count_ab * math.log(p) + (count_a - count_ab) * math.log(1.0 - p)) summand2 = ((count_b - count_ab) * math.log(p) + (N - count_a - count_b + count_ab) * math.log(1.0 - p)) if count_a == count_ab: summand3 = 0 else: summand3 = (count_ab * math.log(p1) + (count_a - count_ab) * math.log(1.0 - p1)) if count_b == count_ab: summand4 = 0 else: summand4 = ((count_b - count_ab) * math.log(p2) + (N - count_a - count_b + count_ab) * math.log(1.0 - p2)) likelihood = summand1 + summand2 - summand3 - summand4 return (-2.0 * likelihood) #//////////////////////////////////////////////////////////// #{ Collocation Finder #//////////////////////////////////////////////////////////// def _is_potential_collocation(self, aug_tok1, aug_tok2): """ Returns True if the pair of tokens may form a collocation given log-likelihood statistics. """ return ((self.INCLUDE_ALL_COLLOCS or (self.INCLUDE_ABBREV_COLLOCS and aug_tok1.abbr) or (aug_tok1.sentbreak and (aug_tok1.is_number or aug_tok1.is_initial))) and aug_tok1.is_non_punct and aug_tok2.is_non_punct) def _find_collocations(self): """ Generates likely collocations and their log-likelihood. """ for types in self._collocation_fdist: try: typ1, typ2 = types except TypeError: # types may be None after calling freq_threshold() continue if typ2 in self._params.sent_starters: continue col_count = self._collocation_fdist[types] typ1_count = self._type_fdist[typ1]+self._type_fdist[typ1+'.'] typ2_count = self._type_fdist[typ2]+self._type_fdist[typ2+'.'] if (typ1_count > 1 and typ2_count > 1 and self.MIN_COLLOC_FREQ < col_count <= min(typ1_count, typ2_count)): ll = self._col_log_likelihood(typ1_count, typ2_count, col_count, self._type_fdist.N()) # Filter out the not-so-collocative if (ll >= self.COLLOCATION and (float(self._type_fdist.N())/typ1_count > float(typ2_count)/col_count)): yield (typ1, typ2), ll #//////////////////////////////////////////////////////////// #{ Sentence-Starter Finder #//////////////////////////////////////////////////////////// def _is_potential_sent_starter(self, cur_tok, prev_tok): """ Returns True given a token and the token that preceds it if it seems clear that the token is beginning a sentence. """ # If a token (i) is preceded by a sentece break that is # not a potential ordinal number or initial, and (ii) is # alphabetic, then it is a a sentence-starter. return ( prev_tok.sentbreak and not (prev_tok.is_number or prev_tok.is_initial) and cur_tok.is_alpha ) def _find_sent_starters(self): """ Uses collocation heuristics for each candidate token to determine if it frequently starts sentences. 
""" for typ in self._sent_starter_fdist: if not typ: continue typ_at_break_count = self._sent_starter_fdist[typ] typ_count = self._type_fdist[typ]+self._type_fdist[typ+'.'] if typ_count < typ_at_break_count: # needed after freq_threshold continue ll = self._col_log_likelihood(self._sentbreak_count, typ_count, typ_at_break_count, self._type_fdist.N()) if (ll >= self.SENT_STARTER and float(self._type_fdist.N())/self._sentbreak_count > float(typ_count)/typ_at_break_count): yield typ, ll def _get_sentbreak_count(self, tokens): """ Returns the number of sentence breaks marked in a given set of augmented tokens. """ return sum(1 for aug_tok in tokens if aug_tok.sentbreak) ###################################################################### #{ Punkt Sentence Tokenizer ###################################################################### class PunktSentenceTokenizer(PunktBaseClass,TokenizerI): """ A sentence tokenizer which uses an unsupervised algorithm to build a model for abbreviation words, collocations, and words that start sentences; and then uses that model to find sentence boundaries. This approach has been shown to work well for many European languages. """ def __init__(self, train_text=None, verbose=False, lang_vars=PunktLanguageVars(), token_cls=PunktToken): """ train_text can either be the sole training text for this sentence boundary detector, or can be a PunktParameters object. """ PunktBaseClass.__init__(self, lang_vars=lang_vars, token_cls=token_cls) if train_text: self._params = self.train(train_text, verbose) def train(self, train_text, verbose=False): """ Derives parameters from a given training text, or uses the parameters given. Repeated calls to this method destroy previous parameters. For incremental training, instantiate a separate PunktTrainer instance. """ if not isinstance(train_text, string_types): return train_text return PunktTrainer(train_text, lang_vars=self._lang_vars, token_cls=self._Token).get_params() #//////////////////////////////////////////////////////////// #{ Tokenization #//////////////////////////////////////////////////////////// def tokenize(self, text, realign_boundaries=True): """ Given a text, returns a list of the sentences in that text. """ return list(self.sentences_from_text(text, realign_boundaries)) def debug_decisions(self, text): """ Classifies candidate periods as sentence breaks, yielding a dict for each that may be used to understand why the decision was made. See format_debug_decision() to help make this output readable. 
""" for match in self._lang_vars.period_context_re().finditer(text): decision_text = match.group() + match.group('after_tok') tokens = self._tokenize_words(decision_text) tokens = list(self._annotate_first_pass(tokens)) while not tokens[0].period_final: tokens.pop(0) yield dict(period_index=match.end() - 1, text=decision_text, type1=tokens[0].type, type2=tokens[1].type, type1_in_abbrs=bool(tokens[0].abbr), type1_is_initial=bool(tokens[0].is_initial), type2_is_sent_starter=tokens[1].type_no_sentperiod in self._params.sent_starters, type2_ortho_heuristic=self._ortho_heuristic(tokens[1]), type2_ortho_contexts=set(self._params._debug_ortho_context(tokens[1].type_no_sentperiod)), collocation=(tokens[0].type_no_sentperiod, tokens[1].type_no_sentperiod) in self._params.collocations, reason=self._second_pass_annotation(tokens[0], tokens[1]) or REASON_DEFAULT_DECISION, break_decision=tokens[0].sentbreak, ) def span_tokenize(self, text, realign_boundaries=True): """ Given a text, returns a list of the (start, end) spans of sentences in the text. """ slices = self._slices_from_text(text) if realign_boundaries: slices = self._realign_boundaries(text, slices) return [(sl.start, sl.stop) for sl in slices] def sentences_from_text(self, text, realign_boundaries=True): """ Given a text, generates the sentences in that text by only testing candidate sentence breaks. If realign_boundaries is True, includes in the sentence closing punctuation that follows the period. """ return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)] def _slices_from_text(self, text): last_break = 0 for match in self._lang_vars.period_context_re().finditer(text): context = match.group() + match.group('after_tok') if self.text_contains_sentbreak(context): yield slice(last_break, match.end()) if match.group('next_tok'): # next sentence starts after whitespace last_break = match.start('next_tok') else: # next sentence starts at following punctuation last_break = match.end() yield slice(last_break, len(text)) def _realign_boundaries(self, text, slices): """ Attempts to realign punctuation that falls after the period but should otherwise be included in the same sentence. For example: "(Sent1.) Sent2." will otherwise be split as:: ["(Sent1.", ") Sent1."]. This method will produce:: ["(Sent1.)", "Sent2."]. """ realign = 0 for sl1, sl2 in _pair_iter(slices): sl1 = slice(sl1.start + realign, sl1.stop) if not sl2: if text[sl1]: yield sl1 continue m = self._lang_vars.re_boundary_realignment.match(text[sl2]) if m: yield slice(sl1.start, sl2.start + len(m.group(0).rstrip())) realign = m.end() else: realign = 0 if text[sl1]: yield sl1 def text_contains_sentbreak(self, text): """ Returns True if the given text includes a sentence break. """ found = False # used to ignore last token for t in self._annotate_tokens(self._tokenize_words(text)): if found: return True if t.sentbreak: found = True return False def sentences_from_text_legacy(self, text): """ Given a text, generates the sentences in that text. Annotates all tokens, rather than just those with possible sentence breaks. Should produce the same results as ``sentences_from_text``. """ tokens = self._annotate_tokens(self._tokenize_words(text)) return self._build_sentence_list(text, tokens) def sentences_from_tokens(self, tokens): """ Given a sequence of tokens, generates lists of tokens, each list corresponding to a sentence. 
""" tokens = iter(self._annotate_tokens(self._Token(t) for t in tokens)) sentence = [] for aug_tok in tokens: sentence.append(aug_tok.tok) if aug_tok.sentbreak: yield sentence sentence = [] if sentence: yield sentence def _annotate_tokens(self, tokens): """ Given a set of tokens augmented with markers for line-start and paragraph-start, returns an iterator through those tokens with full annotation including predicted sentence breaks. """ # Make a preliminary pass through the document, marking likely # sentence breaks, abbreviations, and ellipsis tokens. tokens = self._annotate_first_pass(tokens) # Make a second pass through the document, using token context # information to change our preliminary decisions about where # sentence breaks, abbreviations, and ellipsis occurs. tokens = self._annotate_second_pass(tokens) ## [XX] TESTING #tokens = list(tokens) #self.dump(tokens) return tokens def _build_sentence_list(self, text, tokens): """ Given the original text and the list of augmented word tokens, construct and return a tokenized list of sentence strings. """ # Most of the work here is making sure that we put the right # pieces of whitespace back in all the right places. # Our position in the source text, used to keep track of which # whitespace to add: pos = 0 # A regular expression that finds pieces of whitespace: WS_REGEXP = re.compile(r'\s*') sentence = '' for aug_tok in tokens: tok = aug_tok.tok # Find the whitespace before this token, and update pos. ws = WS_REGEXP.match(text, pos).group() pos += len(ws) # Some of the rules used by the punkt word tokenizer # strip whitespace out of the text, resulting in tokens # that contain whitespace in the source text. If our # token doesn't match, see if adding whitespace helps. # If so, then use the version with whitespace. if text[pos:pos+len(tok)] != tok: pat = '\s*'.join(re.escape(c) for c in tok) m = re.compile(pat).match(text,pos) if m: tok = m.group() # Move our position pointer to the end of the token. assert text[pos:pos+len(tok)] == tok pos += len(tok) # Add this token. If it's not at the beginning of the # sentence, then include any whitespace that separated it # from the previous token. if sentence: sentence += ws sentence += tok # If we're at a sentence break, then start a new sentence. if aug_tok.sentbreak: yield sentence sentence = '' # If the last sentence is emtpy, discard it. if sentence: yield sentence # [XX] TESTING def dump(self, tokens): print('writing to /tmp/punkt.new...') with open('/tmp/punkt.new', 'w') as outfile: for aug_tok in tokens: if aug_tok.parastart: outfile.write('\n\n') elif aug_tok.linestart: outfile.write('\n') else: outfile.write(' ') outfile.write(str(aug_tok)) #//////////////////////////////////////////////////////////// #{ Customization Variables #//////////////////////////////////////////////////////////// PUNCTUATION = tuple(';:,.!?') #//////////////////////////////////////////////////////////// #{ Annotation Procedures #//////////////////////////////////////////////////////////// def _annotate_second_pass(self, tokens): """ Performs a token-based classification (section 4) over the given tokens, making use of the orthographic heuristic (4.1.1), collocation heuristic (4.1.2) and frequent sentence starter heuristic (4.1.3). """ for t1, t2 in _pair_iter(tokens): self._second_pass_annotation(t1, t2) yield t1 def _second_pass_annotation(self, aug_tok1, aug_tok2): """ Performs token-based classification over a pair of contiguous tokens updating the first. """ # Is it the last token? 
We can't do anything then. if not aug_tok2: return tok = aug_tok1.tok if not aug_tok1.period_final: # We only care about words ending in periods. return typ = aug_tok1.type_no_period next_tok = aug_tok2.tok next_typ = aug_tok2.type_no_sentperiod tok_is_initial = aug_tok1.is_initial # [4.1.2. Collocation Heuristic] If there's a # collocation between the word before and after the # period, then label tok as an abbreviation and NOT # a sentence break. Note that collocations with # frequent sentence starters as their second word are # excluded in training. if (typ, next_typ) in self._params.collocations: aug_tok1.sentbreak = False aug_tok1.abbr = True return REASON_KNOWN_COLLOCATION # [4.2. Token-Based Reclassification of Abbreviations] If # the token is an abbreviation or an ellipsis, then decide # whether we should *also* classify it as a sentbreak. if ( (aug_tok1.abbr or aug_tok1.ellipsis) and (not tok_is_initial) ): # [4.1.1. Orthographic Heuristic] Check if there's # orthographic evidence about whether the next word # starts a sentence or not. is_sent_starter = self._ortho_heuristic(aug_tok2) if is_sent_starter == True: aug_tok1.sentbreak = True return REASON_ABBR_WITH_ORTHOGRAPHIC_HEURISTIC # [4.1.3. Frequent Sentence Starter Heuristic] If the # next word is capitalized, and is a member of the # frequent-sentence-starters list, then label tok as a # sentence break. if ( aug_tok2.first_upper and next_typ in self._params.sent_starters): aug_tok1.sentbreak = True return REASON_ABBR_WITH_SENTENCE_STARTER # [4.3. Token-Based Detection of Initials and Ordinals] # Check if any initials or ordinals tokens that are marked # as sentbreaks should be reclassified as abbreviations. if tok_is_initial or typ == '##number##': # [4.1.1. Orthographic Heuristic] Check if there's # orthographic evidence about whether the next word # starts a sentence or not. is_sent_starter = self._ortho_heuristic(aug_tok2) if is_sent_starter == False: aug_tok1.sentbreak = False aug_tok1.abbr = True if tok_is_initial: return REASON_INITIAL_WITH_ORTHOGRAPHIC_HEURISTIC else: return REASON_NUMBER_WITH_ORTHOGRAPHIC_HEURISTIC # Special heuristic for initials: if orthographic # heuristic is unknown, and next word is always # capitalized, then mark as abbrev (eg: J. Bach). if ( is_sent_starter == 'unknown' and tok_is_initial and aug_tok2.first_upper and not (self._params.ortho_context[next_typ] & _ORTHO_LC) ): aug_tok1.sentbreak = False aug_tok1.abbr = True return REASON_INITIAL_WITH_SPECIAL_ORTHOGRAPHIC_HEURISTIC return def _ortho_heuristic(self, aug_tok): """ Decide whether the given token is the first token in a sentence. """ # Sentences don't start with punctuation marks: if aug_tok.tok in self.PUNCTUATION: return False ortho_context = self._params.ortho_context[aug_tok.type_no_sentperiod] # If the word is capitalized, occurs at least once with a # lower case first letter, and never occurs with an upper case # first letter sentence-internally, then it's a sentence starter. if ( aug_tok.first_upper and (ortho_context & _ORTHO_LC) and not (ortho_context & _ORTHO_MID_UC) ): return True # If the word is lower case, and either (a) we've seen it used # with upper case, or (b) we've never seen it used # sentence-initially with lower case, then it's not a sentence # starter. if ( aug_tok.first_lower and ((ortho_context & _ORTHO_UC) or not (ortho_context & _ORTHO_BEG_LC)) ): return False # Otherwise, we're not sure. return 'unknown' DEBUG_DECISION_FMT = '''Text: %(text)r (at offset %(period_index)d) Sentence break?
%(break_decision)s (%(reason)s) Collocation? %(collocation)s %(type1)r: known abbreviation: %(type1_in_abbrs)s is initial: %(type1_is_initial)s %(type2)r: known sentence starter: %(type2_is_sent_starter)s orthographic heuristic suggests is a sentence starter? %(type2_ortho_heuristic)s orthographic contexts in training: %(type2_ortho_contexts)s ''' def format_debug_decision(d): return DEBUG_DECISION_FMT % d def demo(text, tok_cls=PunktSentenceTokenizer, train_cls=PunktTrainer): """Builds a punkt model and applies it to the same text""" cleanup = lambda s: re.compile(r'(?:\r|^\s+)', re.MULTILINE).sub('', s).replace('\n', ' ') trainer = train_cls() trainer.INCLUDE_ALL_COLLOCS = True trainer.train(text) sbd = tok_cls(trainer.get_params()) for l in sbd.sentences_from_text(text): print(cleanup(l)) nltk-3.1/nltk/tokenize/regexp.py0000644000076500000240000001726412607224144016534 0ustar sbstaff00000000000000# Natural Language Toolkit: Tokenizers # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # Trevor Cohn # URL: # For license information, see LICENSE.TXT r""" Regular-Expression Tokenizers A ``RegexpTokenizer`` splits a string into substrings using a regular expression. For example, the following tokenizer forms tokens out of alphabetic sequences, money expressions, and any other non-whitespace sequences: >>> from nltk.tokenize import RegexpTokenizer >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." >>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+') >>> tokenizer.tokenize(s) ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] A ``RegexpTokenizer`` can use its regexp to match delimiters instead: >>> tokenizer = RegexpTokenizer('\s+', gaps=True) >>> tokenizer.tokenize(s) ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] Note that empty tokens are not returned when the delimiter appears at the start or end of the string. The material between the tokens is discarded. For example, the following tokenizer selects just the capitalized words: >>> capword_tokenizer = RegexpTokenizer('[A-Z]\w+') >>> capword_tokenizer.tokenize(s) ['Good', 'New', 'York', 'Please', 'Thanks'] This module contains several subclasses of ``RegexpTokenizer`` that use pre-defined regular expressions. >>> from nltk.tokenize import BlanklineTokenizer >>> # Uses '\s*\n\s*\n\s*': >>> BlanklineTokenizer().tokenize(s) ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', 'Thanks.'] All of the regular expression tokenizers are also available as functions: >>> from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize >>> regexp_tokenize(s, pattern='\w+|\$[\d\.]+|\S+') ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] >>> wordpunct_tokenize(s) ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] >>> blankline_tokenize(s) ['Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.', 'Thanks.'] Caution: The function ``regexp_tokenize()`` takes the text as its first argument, and the regular expression pattern as its second argument. This differs from the conventions used by Python's ``re`` functions, where the pattern is always the first argument. (This is for consistency with the other NLTK tokenizers.) 
""" from __future__ import unicode_literals import re from nltk.tokenize.api import TokenizerI from nltk.tokenize.util import regexp_span_tokenize from nltk.compat import python_2_unicode_compatible @python_2_unicode_compatible class RegexpTokenizer(TokenizerI): """ A tokenizer that splits a string using a regular expression, which matches either the tokens or the separators between tokens. >>> tokenizer = RegexpTokenizer('\w+|\$[\d\.]+|\S+') :type pattern: str :param pattern: The pattern used to build this tokenizer. (This pattern may safely contain capturing parentheses.) :type gaps: bool :param gaps: True if this tokenizer's pattern should be used to find separators between tokens; False if this tokenizer's pattern should be used to find the tokens themselves. :type discard_empty: bool :param discard_empty: True if any empty tokens `''` generated by the tokenizer should be discarded. Empty tokens can only be generated if `_gaps == True`. :type flags: int :param flags: The regexp flags used to compile this tokenizer's pattern. By default, the following flags are used: `re.UNICODE | re.MULTILINE | re.DOTALL`. """ def __init__(self, pattern, gaps=False, discard_empty=True, flags=re.UNICODE | re.MULTILINE | re.DOTALL): # If they gave us a regexp object, extract the pattern. pattern = getattr(pattern, 'pattern', pattern) self._pattern = pattern self._gaps = gaps self._discard_empty = discard_empty self._flags = flags self._regexp = None def _check_regexp(self): if self._regexp is None: self._regexp = re.compile(self._pattern) def tokenize(self, text): self._check_regexp() # If our regexp matches gaps, use re.split: if self._gaps: if self._discard_empty: return [tok for tok in self._regexp.split(text) if tok] else: return self._regexp.split(text) # If our regexp matches tokens, use re.findall: else: return self._regexp.findall(text) def span_tokenize(self, text): self._check_regexp() if self._gaps: for left, right in regexp_span_tokenize(text, self._regexp): if not (self._discard_empty and left == right): yield left, right else: for m in re.finditer(self._regexp, text): yield m.span() def __repr__(self): return ('%s(pattern=%r, gaps=%r, discard_empty=%r, flags=%r)' % (self.__class__.__name__, self._pattern, self._gaps, self._discard_empty, self._flags)) class WhitespaceTokenizer(RegexpTokenizer): r""" Tokenize a string on whitespace (space, tab, newline). In general, users should use the string ``split()`` method instead. >>> from nltk.tokenize import WhitespaceTokenizer >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." >>> WhitespaceTokenizer().tokenize(s) ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] """ def __init__(self): RegexpTokenizer.__init__(self, r'\s+', gaps=True) class BlanklineTokenizer(RegexpTokenizer): """ Tokenize a string, treating any sequence of blank lines as a delimiter. Blank lines are defined as lines containing no characters, except for space or tab characters. """ def __init__(self): RegexpTokenizer.__init__(self, r'\s*\n\s*\n\s*', gaps=True) class WordPunctTokenizer(RegexpTokenizer): """ Tokenize a text into a sequence of alphabetic and non-alphabetic characters, using the regexp ``\w+|[^\w\s]+``. >>> from nltk.tokenize import WordPunctTokenizer >>> s = "Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\n\\nThanks." 
>>> WordPunctTokenizer().tokenize(s) ['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] """ def __init__(self): RegexpTokenizer.__init__(self, r'\w+|[^\w\s]+') ###################################################################### #{ Tokenization Functions ###################################################################### def regexp_tokenize(text, pattern, gaps=False, discard_empty=True, flags=re.UNICODE | re.MULTILINE | re.DOTALL): """ Return a tokenized copy of *text*. See :class:`.RegexpTokenizer` for descriptions of the arguments. """ tokenizer = RegexpTokenizer(pattern, gaps, discard_empty, flags) return tokenizer.tokenize(text) blankline_tokenize = BlanklineTokenizer().tokenize wordpunct_tokenize = WordPunctTokenizer().tokenize nltk-3.1/nltk/tokenize/sexpr.py0000644000076500000240000001216512607224144016376 0ustar sbstaff00000000000000# Natural Language Toolkit: Tokenizers # # Copyright (C) 2001-2015 NLTK Project # Author: Yoav Goldberg # Steven Bird (minor edits) # URL: # For license information, see LICENSE.TXT """ S-Expression Tokenizer ``SExprTokenizer`` is used to find parenthesized expressions in a string. In particular, it divides a string into a sequence of substrings that are either parenthesized expressions (including any nested parenthesized expressions), or other whitespace-separated tokens. >>> from nltk.tokenize import SExprTokenizer >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)') ['(a b (c d))', 'e', 'f', '(g)'] By default, `SExprTokenizer` will raise a ``ValueError`` exception if used to tokenize an expression with non-matching parentheses: >>> SExprTokenizer().tokenize('c) d) e (f (g') Traceback (most recent call last): ... ValueError: Un-matched close paren at char 1 The ``strict`` argument can be set to False to allow for non-matching parentheses. Any unmatched close parentheses will be listed as their own s-expression; and the last partial sexpr with unmatched open parentheses will be listed as its own sexpr: >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g') ['c', ')', 'd', ')', 'e', '(f (g'] The characters used for open and close parentheses may be customized using the ``parens`` argument to the `SExprTokenizer` constructor: >>> SExprTokenizer(parens='{}').tokenize('{a b {c d}} e f {g}') ['{a b {c d}}', 'e', 'f', '{g}'] The s-expression tokenizer is also available as a function: >>> from nltk.tokenize import sexpr_tokenize >>> sexpr_tokenize('(a b (c d)) e f (g)') ['(a b (c d))', 'e', 'f', '(g)'] """ import re from nltk.tokenize.api import TokenizerI class SExprTokenizer(TokenizerI): """ A tokenizer that divides strings into s-expressions. An s-expresion can be either: - a parenthesized expression, including any nested parenthesized expressions, or - a sequence of non-whitespace non-parenthesis characters. For example, the string ``(a (b c)) d e (f)`` consists of four s-expressions: ``(a (b c))``, ``d``, ``e``, and ``(f)``. By default, the characters ``(`` and ``)`` are treated as open and close parentheses, but alternative strings may be specified. :param parens: A two-element sequence specifying the open and close parentheses that should be used to find sexprs. This will typically be either a two-character string, or a list of two strings. :type parens: str or list :param strict: If true, then raise an exception when tokenizing an ill-formed sexpr. 
""" def __init__(self, parens='()', strict=True): if len(parens) != 2: raise ValueError('parens must contain exactly two strings') self._strict = strict self._open_paren = parens[0] self._close_paren = parens[1] self._paren_regexp = re.compile('%s|%s' % (re.escape(parens[0]), re.escape(parens[1]))) def tokenize(self, text): """ Return a list of s-expressions extracted from *text*. For example: >>> SExprTokenizer().tokenize('(a b (c d)) e f (g)') ['(a b (c d))', 'e', 'f', '(g)'] All parentheses are assumed to mark s-expressions. (No special processing is done to exclude parentheses that occur inside strings, or following backslash characters.) If the given expression contains non-matching parentheses, then the behavior of the tokenizer depends on the ``strict`` parameter to the constructor. If ``strict`` is ``True``, then raise a ``ValueError``. If ``strict`` is ``False``, then any unmatched close parentheses will be listed as their own s-expression; and the last partial s-expression with unmatched open parentheses will be listed as its own s-expression: >>> SExprTokenizer(strict=False).tokenize('c) d) e (f (g') ['c', ')', 'd', ')', 'e', '(f (g'] :param text: the string to be tokenized :type text: str or iter(str) :rtype: iter(str) """ result = [] pos = 0 depth = 0 for m in self._paren_regexp.finditer(text): paren = m.group() if depth == 0: result += text[pos:m.start()].split() pos = m.start() if paren == self._open_paren: depth += 1 if paren == self._close_paren: if self._strict and depth == 0: raise ValueError('Un-matched close paren at char %d' % m.start()) depth = max(0, depth-1) if depth == 0: result.append(text[pos:m.end()]) pos = m.end() if self._strict and depth > 0: raise ValueError('Un-matched open paren at char %d' % pos) if pos < len(text): result.append(text[pos:]) return result sexpr_tokenize = SExprTokenizer().tokenize nltk-3.1/nltk/tokenize/simple.py0000644000076500000240000001206012607224144016520 0ustar sbstaff00000000000000# Natural Language Toolkit: Simple Tokenizers # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # URL: # For license information, see LICENSE.TXT r""" Simple Tokenizers These tokenizers divide strings into substrings using the string ``split()`` method. When tokenizing using a particular delimiter string, use the string ``split()`` method directly, as this is more efficient. The simple tokenizers are *not* available as separate functions; instead, you should just use the string ``split()`` method directly: >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." >>> s.split() ['Good', 'muffins', 'cost', '$3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.'] >>> s.split(' ') ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '', 'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.'] >>> s.split('\n') ['Good muffins cost $3.88', 'in New York. Please buy me', 'two of them.', '', 'Thanks.'] The simple tokenizers are mainly useful because they follow the standard ``TokenizerI`` interface, and so can be used with any code that expects a tokenizer. For example, these tokenizers can be used to specify the tokenization conventions when building a `CorpusReader`. 
""" from __future__ import unicode_literals from nltk.tokenize.api import TokenizerI, StringTokenizer from nltk.tokenize.util import string_span_tokenize, regexp_span_tokenize class SpaceTokenizer(StringTokenizer): r"""Tokenize a string using the space character as a delimiter, which is the same as ``s.split(' ')``. >>> from nltk.tokenize import SpaceTokenizer >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." >>> SpaceTokenizer().tokenize(s) ['Good', 'muffins', 'cost', '$3.88\nin', 'New', 'York.', '', 'Please', 'buy', 'me\ntwo', 'of', 'them.\n\nThanks.'] """ _string = ' ' class TabTokenizer(StringTokenizer): r"""Tokenize a string use the tab character as a delimiter, the same as ``s.split('\t')``. >>> from nltk.tokenize import TabTokenizer >>> TabTokenizer().tokenize('a\tb c\n\t d') ['a', 'b c\n', ' d'] """ _string = '\t' class CharTokenizer(StringTokenizer): """Tokenize a string into individual characters. If this functionality is ever required directly, use ``for char in string``. """ def tokenize(self, s): return list(s) def span_tokenize(self, s): for i, j in enumerate(range(1, len(s) + 1)): yield i, j class LineTokenizer(TokenizerI): r"""Tokenize a string into its lines, optionally discarding blank lines. This is similar to ``s.split('\n')``. >>> from nltk.tokenize import LineTokenizer >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." >>> LineTokenizer(blanklines='keep').tokenize(s) ['Good muffins cost $3.88', 'in New York. Please buy me', 'two of them.', '', 'Thanks.'] >>> # same as [l for l in s.split('\n') if l.strip()]: >>> LineTokenizer(blanklines='discard').tokenize(s) ['Good muffins cost $3.88', 'in New York. Please buy me', 'two of them.', 'Thanks.'] :param blanklines: Indicates how blank lines should be handled. Valid values are: - ``discard``: strip blank lines out of the token list before returning it. A line is considered blank if it contains only whitespace characters. - ``keep``: leave all blank lines in the token list. - ``discard-eof``: if the string ends with a newline, then do not generate a corresponding token ``''`` after that newline. """ def __init__(self, blanklines='discard'): valid_blanklines = ('discard', 'keep', 'discard-eof') if blanklines not in valid_blanklines: raise ValueError('Blank lines must be one of: %s' % ' '.join(valid_blanklines)) self._blanklines = blanklines def tokenize(self, s): lines = s.splitlines() # If requested, strip off blank lines. 
if self._blanklines == 'discard': lines = [l for l in lines if l.rstrip()] elif self._blanklines == 'discard-eof': if lines and not lines[-1].strip(): lines.pop() return lines # discard-eof not implemented def span_tokenize(self, s): if self._blanklines == 'keep': for span in string_span_tokenize(s, r'\n'): yield span else: for span in regexp_span_tokenize(s, r'\n(\s+\n)*'): yield span ###################################################################### #{ Tokenization Functions ###################################################################### # XXX: it is stated in module docs that there is no function versions def line_tokenize(text, blanklines='discard'): return LineTokenizer(blanklines).tokenize(text) nltk-3.1/nltk/tokenize/stanford.py0000644000076500000240000000705212607224144017054 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Interface to the Stanford Tokenizer # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Xu # # URL: # For license information, see LICENSE.TXT from __future__ import unicode_literals, print_function import tempfile import os import json from subprocess import PIPE from nltk import compat from nltk.internals import find_jar, config_java, java, _java_options from nltk.tokenize.api import TokenizerI _stanford_url = 'http://nlp.stanford.edu/software/tokenizer.shtml' class StanfordTokenizer(TokenizerI): r""" Interface to the Stanford Tokenizer >>> from nltk.tokenize import StanfordTokenizer >>> s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks." >>> StanfordTokenizer().tokenize(s) ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.'] >>> s = "The colour of the wall is blue." >>> StanfordTokenizer(options={"americanize": True}).tokenize(s) ['The', 'color', 'of', 'the', 'wall', 'is', 'blue', '.'] """ _JAR = 'stanford-postagger.jar' def __init__(self, path_to_jar=None, encoding='utf8', options=None, verbose=False, java_options='-mx1000m'): self._stanford_jar = find_jar( self._JAR, path_to_jar, env_vars=('STANFORD_POSTAGGER',), searchpath=(), url=_stanford_url, verbose=verbose ) self._encoding = encoding self.java_options = java_options options = {} if options is None else options self._options_cmd = ','.join('{0}={1}'.format(key, val) for key, val in options.items()) @staticmethod def _parse_tokenized_output(s): return s.splitlines() def tokenize(self, s): """ Use stanford tokenizer's PTBTokenizer to tokenize multiple sentences. """ cmd = [ 'edu.stanford.nlp.process.PTBTokenizer', ] return self._parse_tokenized_output(self._execute(cmd, s)) def _execute(self, cmd, input_, verbose=False): encoding = self._encoding cmd.extend(['-charset', encoding]) _options_cmd = self._options_cmd if _options_cmd: cmd.extend(['-options', self._options_cmd]) default_options = ' '.join(_java_options) # Configure java. config_java(options=self.java_options, verbose=verbose) # Windows is incompatible with NamedTemporaryFile() without passing in delete=False. with tempfile.NamedTemporaryFile(mode='wb', delete=False) as input_file: # Write the actual sentences to the temporary input file if isinstance(input_, compat.text_type) and encoding: input_ = input_.encode(encoding) input_file.write(input_) input_file.flush() cmd.append(input_file.name) # Run the tagger and get the output. 
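# Illustrative sketch (not part of the original module): for a call such as
# StanfordTokenizer(options={"americanize": True}).tokenize(s), the command
# list assembled above ends up roughly as below before being handed to
# nltk.internals.java() with the jar on the classpath.  The helper name and
# the temp-file path are hypothetical placeholders.
def _sketch_ptbtokenizer_cmd(input_path='/tmp/tmpXXXXXX'):
    return ['edu.stanford.nlp.process.PTBTokenizer',
            '-charset', 'utf8',
            '-options', 'americanize=True',
            input_path]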
stdout, stderr = java(cmd, classpath=self._stanford_jar, stdout=PIPE, stderr=PIPE) stdout = stdout.decode(encoding) os.unlink(input_file.name) # Return java configurations to their default values. config_java(options=default_options, verbose=False) return stdout def setup_module(module): from nose import SkipTest try: StanfordTokenizer() except LookupError: raise SkipTest('doctests from nltk.tokenize.stanford are skipped because the stanford postagger jar doesn\'t exist') nltk-3.1/nltk/tokenize/texttiling.py0000644000076500000240000004072212607224144017430 0ustar sbstaff00000000000000# Natural Language Toolkit: TextTiling # # Copyright (C) 2001-2015 NLTK Project # Author: George Boutsioukis # # URL: # For license information, see LICENSE.TXT import re import math try: import numpy except ImportError: pass from nltk.tokenize.api import TokenizerI BLOCK_COMPARISON, VOCABULARY_INTRODUCTION = 0, 1 LC, HC = 0, 1 DEFAULT_SMOOTHING = [0] class TextTilingTokenizer(TokenizerI): """Tokenize a document into topical sections using the TextTiling algorithm. This algorithm detects subtopic shifts based on the analysis of lexical co-occurrence patterns. The process starts by tokenizing the text into pseudosentences of a fixed size w. Then, depending on the method used, similarity scores are assigned at sentence gaps. The algorithm proceeds by detecting the peak differences between these scores and marking them as boundaries. The boundaries are normalized to the closest paragraph break and the segmented text is returned. :param w: Pseudosentence size :type w: int :param k: Size (in sentences) of the block used in the block comparison method :type k: int :param similarity_method: The method used for determining similarity scores: `BLOCK_COMPARISON` (default) or `VOCABULARY_INTRODUCTION`. 
:type similarity_method: constant :param stopwords: A list of stopwords that are filtered out (defaults to NLTK's stopwords corpus) :type stopwords: list(str) :param smoothing_method: The method used for smoothing the score plot: `DEFAULT_SMOOTHING` (default) :type smoothing_method: constant :param smoothing_width: The width of the window used by the smoothing method :type smoothing_width: int :param smoothing_rounds: The number of smoothing passes :type smoothing_rounds: int :param cutoff_policy: The policy used to determine the number of boundaries: `HC` (default) or `LC` :type cutoff_policy: constant >>> from nltk.corpus import brown >>> tt = TextTilingTokenizer(demo_mode=True) >>> text = brown.raw()[:10000] >>> s, ss, d, b = tt.tokenize(text) >>> b [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0] """ def __init__(self, w=20, k=10, similarity_method=BLOCK_COMPARISON, stopwords=None, smoothing_method=DEFAULT_SMOOTHING, smoothing_width=2, smoothing_rounds=1, cutoff_policy=HC, demo_mode=False): if stopwords is None: from nltk.corpus import stopwords stopwords = stopwords.words('english') self.__dict__.update(locals()) del self.__dict__['self'] def tokenize(self, text): """Return a tokenized copy of *text*, where each "token" represents a separate topic.""" lowercase_text = text.lower() paragraph_breaks = self._mark_paragraph_breaks(text) text_length = len(lowercase_text) # Tokenization step starts here # Remove punctuation nopunct_text = ''.join(c for c in lowercase_text if re.match("[a-z\-\' \n\t]", c)) nopunct_par_breaks = self._mark_paragraph_breaks(nopunct_text) tokseqs = self._divide_to_tokensequences(nopunct_text) # The morphological stemming step mentioned in the TextTile # paper is not implemented. A comment in the original C # implementation states that it offers no benefit to the # process. It might be interesting to test the existing # stemmers though. 
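# Illustrative sketch (not part of the original module): the gap score later
# computed by _block_comparison is a cosine similarity between the word-count
# vectors of the two blocks surrounding a gap.  The helper name and the toy
# counts are made up.
def _sketch_block_score():
    import math
    b1 = {'cat': 2, 'mat': 1}     # counts in the block before the gap
    b2 = {'cat': 1, 'dog': 3}     # counts in the block after the gap
    vocab = set(b1) | set(b2)
    dividend = sum(b1.get(t, 0) * b2.get(t, 0) for t in vocab)
    divisor = math.sqrt(sum(c * c for c in b1.values()) *
                        sum(c * c for c in b2.values()))
    return dividend / divisor     # ~0.28: low similarity suggests a topic shift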
#words = _stem_words(words) # Filter stopwords for ts in tokseqs: ts.wrdindex_list = [wi for wi in ts.wrdindex_list if wi[0] not in self.stopwords] token_table = self._create_token_table(tokseqs, nopunct_par_breaks) # End of the Tokenization step # Lexical score determination if self.similarity_method == BLOCK_COMPARISON: gap_scores = self._block_comparison(tokseqs, token_table) elif self.similarity_method == VOCABULARY_INTRODUCTION: raise NotImplementedError("Vocabulary introduction not implemented") if self.smoothing_method == DEFAULT_SMOOTHING: smooth_scores = self._smooth_scores(gap_scores) # End of Lexical score Determination # Boundary identification depth_scores = self._depth_scores(smooth_scores) segment_boundaries = self._identify_boundaries(depth_scores) normalized_boundaries = self._normalize_boundaries(text, segment_boundaries, paragraph_breaks) # End of Boundary Identification segmented_text = [] prevb = 0 for b in normalized_boundaries: if b == 0: continue segmented_text.append(text[prevb:b]) prevb = b if prevb < text_length: # append any text that may be remaining segmented_text.append(text[prevb:]) if not segmented_text: segmented_text = [text] if self.demo_mode: return gap_scores, smooth_scores, depth_scores, segment_boundaries return segmented_text def _block_comparison(self, tokseqs, token_table): "Implements the block comparison method" def blk_frq(tok, block): ts_occs = filter(lambda o: o[0] in block, token_table[tok].ts_occurences) freq = sum([tsocc[1] for tsocc in ts_occs]) return freq gap_scores = [] numgaps = len(tokseqs)-1 for curr_gap in range(numgaps): score_dividend, score_divisor_b1, score_divisor_b2 = 0.0, 0.0, 0.0 score = 0.0 #adjust window size for boundary conditions if curr_gap < self.k-1: window_size = curr_gap + 1 elif curr_gap > numgaps-self.k: window_size = numgaps - curr_gap else: window_size = self.k b1 = [ts.index for ts in tokseqs[curr_gap-window_size+1 : curr_gap+1]] b2 = [ts.index for ts in tokseqs[curr_gap+1 : curr_gap+window_size+1]] for t in token_table: score_dividend += blk_frq(t, b1)*blk_frq(t, b2) score_divisor_b1 += blk_frq(t, b1)**2 score_divisor_b2 += blk_frq(t, b2)**2 try: score = score_dividend/math.sqrt(score_divisor_b1* score_divisor_b2) except ZeroDivisionError: pass # score += 0.0 gap_scores.append(score) return gap_scores def _smooth_scores(self, gap_scores): "Wraps the smooth function from the SciPy Cookbook" return list(smooth(numpy.array(gap_scores[:]), window_len = self.smoothing_width+1)) def _mark_paragraph_breaks(self, text): """Identifies indented text or line breaks as the beginning of paragraphs""" MIN_PARAGRAPH = 100 pattern = re.compile("[ \t\r\f\v]*\n[ \t\r\f\v]*\n[ \t\r\f\v]*") matches = pattern.finditer(text) last_break = 0 pbreaks = [0] for pb in matches: if pb.start()-last_break < MIN_PARAGRAPH: continue else: pbreaks.append(pb.start()) last_break = pb.start() return pbreaks def _divide_to_tokensequences(self, text): "Divides the text into pseudosentences of fixed size" w = self.w wrdindex_list = [] matches = re.finditer("\w+", text) for match in matches: wrdindex_list.append((match.group(), match.start())) return [TokenSequence(i/w, wrdindex_list[i:i+w]) for i in range(0, len(wrdindex_list), w)] def _create_token_table(self, token_sequences, par_breaks): "Creates a table of TokenTableFields" token_table = {} current_par = 0 current_tok_seq = 0 pb_iter = par_breaks.__iter__() current_par_break = next(pb_iter) if current_par_break == 0: try: current_par_break = next(pb_iter) #skip break at 0 except StopIteration: 
raise ValueError( "No paragraph breaks were found(text too short perhaps?)" ) for ts in token_sequences: for word, index in ts.wrdindex_list: try: while index > current_par_break: current_par_break = next(pb_iter) current_par += 1 except StopIteration: #hit bottom pass if word in token_table: token_table[word].total_count += 1 if token_table[word].last_par != current_par: token_table[word].last_par = current_par token_table[word].par_count += 1 if token_table[word].last_tok_seq != current_tok_seq: token_table[word].last_tok_seq = current_tok_seq token_table[word]\ .ts_occurences.append([current_tok_seq,1]) else: token_table[word].ts_occurences[-1][1] += 1 else: #new word token_table[word] = TokenTableField(first_pos=index, ts_occurences= \ [[current_tok_seq,1]], total_count=1, par_count=1, last_par=current_par, last_tok_seq= \ current_tok_seq) current_tok_seq += 1 return token_table def _identify_boundaries(self, depth_scores): """Identifies boundaries at the peaks of similarity score differences""" boundaries = [0 for x in depth_scores] avg = sum(depth_scores)/len(depth_scores) stdev = numpy.std(depth_scores) #SB: what is the purpose of this conditional? if self.cutoff_policy == LC: cutoff = avg-stdev/2.0 else: cutoff = avg-stdev/2.0 depth_tuples = sorted(zip(depth_scores, range(len(depth_scores)))) depth_tuples.reverse() hp = list(filter(lambda x:x[0]>cutoff, depth_tuples)) for dt in hp: boundaries[dt[1]] = 1 for dt2 in hp: #undo if there is a boundary close already if dt[1] != dt2[1] and abs(dt2[1]-dt[1]) < 4 \ and boundaries[dt2[1]] == 1: boundaries[dt[1]] = 0 return boundaries def _depth_scores(self, scores): """Calculates the depth of each gap, i.e. the average difference between the left and right peaks and the gap's score""" depth_scores = [0 for x in scores] #clip boundaries: this holds on the rule of thumb(my thumb) #that a section shouldn't be smaller than at least 2 #pseudosentences for small texts and around 5 for larger ones. 
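#clip is the number of gap positions ignored at each end of the score
#sequence: between 2 and 5, scaled as roughly one tenth of the number of gaps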
clip = min(max(len(scores)/10, 2), 5) index = clip for gapscore in scores[clip:-clip]: lpeak = gapscore for score in scores[index::-1]: if score >= lpeak: lpeak = score else: break rpeak = gapscore for score in scores[index:]: if score >= rpeak: rpeak = score else: break depth_scores[index] = lpeak + rpeak - 2 * gapscore index += 1 return depth_scores def _normalize_boundaries(self, text, boundaries, paragraph_breaks): """Normalize the boundaries identified to the original text's paragraph breaks""" norm_boundaries = [] char_count, word_count, gaps_seen = 0, 0, 0 seen_word = False for char in text: char_count += 1 if char in " \t\n" and seen_word: seen_word = False word_count += 1 if char not in " \t\n" and not seen_word: seen_word=True if gaps_seen < len(boundaries) and word_count > \ (max(gaps_seen*self.w, self.w)): if boundaries[gaps_seen] == 1: #find closest paragraph break best_fit = len(text) for br in paragraph_breaks: if best_fit > abs(br-char_count): best_fit = abs(br-char_count) bestbr = br else: break if bestbr not in norm_boundaries: #avoid duplicates norm_boundaries.append(bestbr) gaps_seen += 1 return norm_boundaries class TokenTableField(object): """A field in the token table holding parameters for each token, used later in the process""" def __init__(self, first_pos, ts_occurences, total_count=1, par_count=1, last_par=0, last_tok_seq=None): self.__dict__.update(locals()) del self.__dict__['self'] class TokenSequence(object): "A token list with its original length and its index" def __init__(self, index, wrdindex_list, original_length=None): original_length=original_length or len(wrdindex_list) self.__dict__.update(locals()) del self.__dict__['self'] #Pasted from the SciPy cookbook: http://www.scipy.org/Cookbook/SignalSmooth def smooth(x,window_len=11,window='flat'): """smooth the data using a window with requested size. This method is based on the convolution of a scaled window with the signal. The signal is prepared by introducing reflected copies of the signal (with the window size) in both ends so that transient parts are minimized in the beginning and end part of the output signal. :param x: the input signal :param window_len: the dimension of the smoothing window; should be an odd integer :param window: the type of window from 'flat', 'hanning', 'hamming', 'bartlett', 'blackman' flat window will produce a moving average smoothing. :return: the smoothed signal example:: t=linspace(-2,2,0.1) x=sin(t)+randn(len(t))*0.1 y=smooth(x) :see also: numpy.hanning, numpy.hamming, numpy.bartlett, numpy.blackman, numpy.convolve, scipy.signal.lfilter TODO: the window parameter could be the window itself if an array instead of a string """ if x.ndim != 1: raise ValueError("smooth only accepts 1 dimension arrays.") if x.size < window_len: raise ValueError("Input vector needs to be bigger than window size.") if window_len < 3: return x if not window in ['flat', 'hanning', 'hamming', 'bartlett', 'blackman']: raise ValueError("Window is on of 'flat', 'hanning', 'hamming', 'bartlett', 'blackman'") s=numpy.r_[2*x[0]-x[window_len:1:-1],x,2*x[-1]-x[-1:-window_len:-1]] #print(len(s)) if window == 'flat': #moving average w = numpy.ones(window_len,'d') else: w = eval('numpy.' 
+ window + '(window_len)') y = numpy.convolve(w/w.sum(), s, mode='same') return y[window_len-1:-window_len+1] def demo(text=None): from nltk.corpus import brown from matplotlib import pylab tt = TextTilingTokenizer(demo_mode=True) if text is None: text = brown.raw()[:10000] s, ss, d, b = tt.tokenize(text) pylab.xlabel("Sentence Gap index") pylab.ylabel("Gap Scores") pylab.plot(range(len(s)), s, label="Gap Scores") pylab.plot(range(len(ss)), ss, label="Smoothed Gap scores") pylab.plot(range(len(d)), d, label="Depth scores") pylab.stem(range(len(b)), b) pylab.legend() pylab.show() nltk-3.1/nltk/tokenize/treebank.py0000644000076500000240000001016712607224144017030 0ustar sbstaff00000000000000# Natural Language Toolkit: Tokenizers # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Michael Heilman (re-port from http://www.cis.upenn.edu/~treebank/tokenizer.sed) # # URL: # For license information, see LICENSE.TXT r""" Penn Treebank Tokenizer The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank. This implementation is a port of the tokenizer sed script written by Robert McIntyre and available at http://www.cis.upenn.edu/~treebank/tokenizer.sed. """ import re from nltk.tokenize.api import TokenizerI class TreebankWordTokenizer(TokenizerI): """ The Treebank tokenizer uses regular expressions to tokenize text as in Penn Treebank. This is the method that is invoked by ``word_tokenize()``. It assumes that the text has already been segmented into sentences, e.g. using ``sent_tokenize()``. This tokenizer performs the following steps: - split standard contractions, e.g. ``don't`` -> ``do n't`` and ``they'll`` -> ``they 'll`` - treat most punctuation characters as separate tokens - split off commas and single quotes, when followed by whitespace - separate periods that appear at the end of line >>> from nltk.tokenize import TreebankWordTokenizer >>> s = '''Good muffins cost $3.88\\nin New York. Please buy me\\ntwo of them.\\nThanks.''' >>> TreebankWordTokenizer().tokenize(s) ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.'] >>> s = "They'll save and invest more." >>> TreebankWordTokenizer().tokenize(s) ['They', "'ll", 'save', 'and', 'invest', 'more', '.'] >>> s = "hi, my name can't hello," >>> TreebankWordTokenizer().tokenize(s) ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ','] """ # List of contractions adapted from Robert MacIntyre's tokenizer. CONTRACTIONS2 = [re.compile(r"(?i)\b(can)(not)\b"), re.compile(r"(?i)\b(d)('ye)\b"), re.compile(r"(?i)\b(gim)(me)\b"), re.compile(r"(?i)\b(gon)(na)\b"), re.compile(r"(?i)\b(got)(ta)\b"), re.compile(r"(?i)\b(lem)(me)\b"), re.compile(r"(?i)\b(mor)('n)\b"), re.compile(r"(?i)\b(wan)(na) ")] CONTRACTIONS3 = [re.compile(r"(?i) ('t)(is)\b"), re.compile(r"(?i) ('t)(was)\b")] CONTRACTIONS4 = [re.compile(r"(?i)\b(whad)(dd)(ya)\b"), re.compile(r"(?i)\b(wha)(t)(cha)\b")] def tokenize(self, text): #starting quotes text = re.sub(r'^\"', r'``', text) text = re.sub(r'(``)', r' \1 ', text) text = re.sub(r'([ (\[{<])"', r'\1 `` ', text) #punctuation text = re.sub(r'([:,])([^\d])', r' \1 \2', text) text = re.sub(r'([:,])$', r' \1 ', text) text = re.sub(r'\.\.\.', r' ... ', text) text = re.sub(r'[;@#$%&]', r' \g<0> ', text) text = re.sub(r'([^\.])(\.)([\]\)}>"\']*)\s*$', r'\1 \2\3 ', text) text = re.sub(r'[?!]', r' \g<0> ', text) text = re.sub(r"([^'])' ", r"\1 ' ", text) #parens, brackets, etc. 
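#pad each bracket, brace, parenthesis and angle bracket with spaces so that
#it becomes a separate token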
text = re.sub(r'[\]\[\(\)\{\}\<\>]', r' \g<0> ', text) text = re.sub(r'--', r' -- ', text) #add extra space to make things easier text = " " + text + " " #ending quotes text = re.sub(r'"', " '' ", text) text = re.sub(r'(\S)(\'\')', r'\1 \2 ', text) text = re.sub(r"([^' ])('[sS]|'[mM]|'[dD]|') ", r"\1 \2 ", text) text = re.sub(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) ", r"\1 \2 ", text) for regexp in self.CONTRACTIONS2: text = regexp.sub(r' \1 \2 ', text) for regexp in self.CONTRACTIONS3: text = regexp.sub(r' \1 \2 ', text) # We are not using CONTRACTIONS4 since # they are also commented out in the SED scripts # for regexp in self.CONTRACTIONS4: # text = regexp.sub(r' \1 \2 \3 ', text) return text.split() nltk-3.1/nltk/tokenize/util.py0000644000076500000240000000576012607224144016215 0ustar sbstaff00000000000000# Natural Language Toolkit: Tokenizer Utilities # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT from re import finditer def string_span_tokenize(s, sep): r""" Return the offsets of the tokens in *s*, as a sequence of ``(start, end)`` tuples, by splitting the string at each occurrence of *sep*. >>> from nltk.tokenize.util import string_span_tokenize >>> s = '''Good muffins cost $3.88\nin New York. Please buy me ... two of them.\n\nThanks.''' >>> list(string_span_tokenize(s, " ")) [(0, 4), (5, 12), (13, 17), (18, 26), (27, 30), (31, 36), (37, 37), (38, 44), (45, 48), (49, 55), (56, 58), (59, 73)] :param s: the string to be tokenized :type s: str :param sep: the token separator :type sep: str :rtype: iter(tuple(int, int)) """ if len(sep) == 0: raise ValueError("Token delimiter must not be empty") left = 0 while True: try: right = s.index(sep, left) if right != 0: yield left, right except ValueError: if left != len(s): yield left, len(s) break left = right + len(sep) def regexp_span_tokenize(s, regexp): r""" Return the offsets of the tokens in *s*, as a sequence of ``(start, end)`` tuples, by splitting the string at each successive match of *regexp*. >>> from nltk.tokenize import WhitespaceTokenizer >>> s = '''Good muffins cost $3.88\nin New York. Please buy me ... two of them.\n\nThanks.''' >>> list(WhitespaceTokenizer().span_tokenize(s)) [(0, 4), (5, 12), (13, 17), (18, 23), (24, 26), (27, 30), (31, 36), (38, 44), (45, 48), (49, 51), (52, 55), (56, 58), (59, 64), (66, 73)] :param s: the string to be tokenized :type s: str :param regexp: regular expression that matches token separators :type regexp: str :rtype: iter(tuple(int, int)) """ left = 0 for m in finditer(regexp, s): right, next = m.span() if right != 0: yield left, right left = next yield left, len(s) def spans_to_relative(spans): r""" Return a sequence of relative spans, given a sequence of spans. >>> from nltk.tokenize import WhitespaceTokenizer >>> from nltk.tokenize.util import spans_to_relative >>> s = '''Good muffins cost $3.88\nin New York. Please buy me ... 
two of them.\n\nThanks.''' >>> list(spans_to_relative(WhitespaceTokenizer().span_tokenize(s))) [(0, 4), (1, 7), (1, 4), (1, 5), (1, 2), (1, 3), (1, 5), (2, 6), (1, 3), (1, 2), (1, 3), (1, 2), (1, 5), (2, 7)] :param spans: a sequence of (start, end) offsets of the tokens :type spans: iter(tuple(int, int)) :rtype: iter(tuple(int, int)) """ prev = 0 for left, right in spans: yield left - prev, right - left prev = right nltk-3.1/nltk/toolbox.py0000644000076500000240000004316312607224144015075 0ustar sbstaff00000000000000# coding: utf-8 # Natural Language Toolkit: Toolbox Reader # # Copyright (C) 2001-2015 NLTK Project # Author: Greg Aumann # URL: # For license information, see LICENSE.TXT """ Module for reading, writing and manipulating Toolbox databases and settings files. """ from __future__ import print_function import os, re, codecs from xml.etree.ElementTree import ElementTree, TreeBuilder, Element, SubElement from nltk.compat import StringIO, u, PY3 from nltk.data import PathPointer, ZipFilePathPointer, find class StandardFormat(object): """ Class for reading and processing standard format marker files and strings. """ def __init__(self, filename=None, encoding=None): self._encoding = encoding if filename is not None: self.open(filename) def open(self, sfm_file): """ Open a standard format marker file for sequential reading. :param sfm_file: name of the standard format marker input file :type sfm_file: str """ if isinstance(sfm_file, PathPointer): # [xx] We don't use 'rU' mode here -- do we need to? # (PathPointer.open doesn't take a mode option) self._file = sfm_file.open(self._encoding) else: self._file = codecs.open(sfm_file, 'rU', self._encoding) def open_string(self, s): """ Open a standard format marker string for sequential reading. :param s: string to parse as a standard format marker input file :type s: str """ self._file = StringIO(s) def raw_fields(self): """ Return an iterator that returns the next field in a (marker, value) tuple. Linebreaks and trailing white space are preserved except for the final newline in each field. :rtype: iter(tuple(str, str)) """ join_string = '\n' line_regexp = r'^%s(?:\\(\S+)\s*)?(.*)$' # discard a BOM in the first line first_line_pat = re.compile(line_regexp % '(?:\xef\xbb\xbf)?') line_pat = re.compile(line_regexp % '') # need to get first line outside the loop for correct handling # of the first marker if it spans multiple lines file_iter = iter(self._file) line = next(file_iter) mobj = re.match(first_line_pat, line) mkr, line_value = mobj.groups() value_lines = [line_value,] self.line_num = 0 for line in file_iter: self.line_num += 1 mobj = re.match(line_pat, line) line_mkr, line_value = mobj.groups() if line_mkr: yield (mkr, join_string.join(value_lines)) mkr = line_mkr value_lines = [line_value,] else: value_lines.append(line_value) self.line_num += 1 yield (mkr, join_string.join(value_lines)) def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None): """ Return an iterator that returns the next field in a ``(marker, value)`` tuple, where ``marker`` and ``value`` are unicode strings if an ``encoding`` was specified in the ``fields()`` method. Otherwise they are non-unicode strings. :param strip: strip trailing whitespace from the last line of each field :type strip: bool :param unwrap: Convert newlines in a field to spaces. :type unwrap: bool :param encoding: Name of an encoding to use. If it is specified then the ``fields()`` method returns unicode strings rather than non unicode strings. 
:type encoding: str or None :param errors: Error handling scheme for codec. Same as the ``decode()`` builtin string method. :type errors: str :param unicode_fields: Set of marker names whose values are UTF-8 encoded. Ignored if encoding is None. If the whole file is UTF-8 encoded set ``encoding='utf8'`` and leave ``unicode_fields`` with its default value of None. :type unicode_fields: sequence :rtype: iter(tuple(str, str)) """ if encoding is None and unicode_fields is not None: raise ValueError('unicode_fields is set but not encoding.') unwrap_pat = re.compile(r'\n+') for mkr, val in self.raw_fields(): if encoding and not PY3: # kludge - already decoded in PY3? if unicode_fields is not None and mkr in unicode_fields: val = val.decode('utf8', errors) else: val = val.decode(encoding, errors) mkr = mkr.decode(encoding, errors) if unwrap: val = unwrap_pat.sub(' ', val) if strip: val = val.rstrip() yield (mkr, val) def close(self): """Close a previously opened standard format marker file or string.""" self._file.close() try: del self.line_num except AttributeError: pass class ToolboxData(StandardFormat): def parse(self, grammar=None, **kwargs): if grammar: return self._chunk_parse(grammar=grammar, **kwargs) else: return self._record_parse(**kwargs) def _record_parse(self, key=None, **kwargs): """ Returns an element tree structure corresponding to a toolbox data file with all markers at the same level. Thus the following Toolbox database:: \_sh v3.0 400 Rotokas Dictionary \_DateStampHasFourDigitYear \lx kaa \ps V.A \ge gag \gp nek i pas \lx kaa \ps V.B \ge strangle \gp pasim nek after parsing will end up with the same structure (ignoring the extra whitespace) as the following XML fragment after being parsed by ElementTree::
    <_sh>v3.0 400 Rotokas Dictionary <_DateStampHasFourDigitYear/>
    kaa V.A gag nek i pas kaa V.B strangle pasim nek
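A parsed database can then be inspected with the ordinary ElementTree API.
A minimal sketch (assuming a Toolbox dictionary such as the Rotokas file
shown above is available)::

    lexicon = ToolboxData('rotokas.dic').parse()
    for lx in lexicon.findall('record/lx'):
        print(lx.text)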
    :param key: Name of key marker at the start of each record. If set to None (the default value) the first marker that doesn't begin with an underscore is assumed to be the key. :type key: str :param kwargs: Keyword arguments passed to ``StandardFormat.fields()`` :type kwargs: dict :rtype: ElementTree._ElementInterface :return: contents of toolbox data divided into header and records """ builder = TreeBuilder() builder.start('toolbox_data', {}) builder.start('header', {}) in_records = False for mkr, value in self.fields(**kwargs): if key is None and not in_records and mkr[0] != '_': key = mkr if mkr == key: if in_records: builder.end('record') else: builder.end('header') in_records = True builder.start('record', {}) builder.start(mkr, {}) builder.data(value) builder.end(mkr) if in_records: builder.end('record') else: builder.end('header') builder.end('toolbox_data') return builder.close() def _tree2etree(self, parent): from nltk.tree import Tree root = Element(parent.label()) for child in parent: if isinstance(child, Tree): root.append(self._tree2etree(child)) else: text, tag = child e = SubElement(root, tag) e.text = text return root def _chunk_parse(self, grammar=None, root_label='record', trace=0, **kwargs): """ Returns an element tree structure corresponding to a toolbox data file parsed according to the chunk grammar. :type grammar: str :param grammar: Contains the chunking rules used to parse the database. See ``chunk.RegExp`` for documentation. :type root_label: str :param root_label: The node value that should be used for the top node of the chunk structure. :type trace: int :param trace: The level of tracing that should be used when parsing a text. ``0`` will generate no tracing output; ``1`` will generate normal tracing output; and ``2`` or higher will generate verbose tracing output. :type kwargs: dict :param kwargs: Keyword arguments passed to ``toolbox.StandardFormat.fields()`` :rtype: ElementTree._ElementInterface """ from nltk import chunk from nltk.tree import Tree cp = chunk.RegexpParser(grammar, root_label=root_label, trace=trace) db = self.parse(**kwargs) tb_etree = Element('toolbox_data') header = db.find('header') tb_etree.append(header) for record in db.findall('record'): parsed = cp.parse([(elem.text, elem.tag) for elem in record]) tb_etree.append(self._tree2etree(parsed)) return tb_etree _is_value = re.compile(r"\S") def to_sfm_string(tree, encoding=None, errors='strict', unicode_fields=None): """ Return a string with a standard format representation of the toolbox data in tree (tree can be a toolbox database or a single record). :param tree: flat representation of toolbox data (whole database or single record) :type tree: ElementTree._ElementInterface :param encoding: Name of an encoding to use. :type encoding: str :param errors: Error handling scheme for codec. Same as the ``encode()`` builtin string method. 
:type errors: str :param unicode_fields: :type unicode_fields: dict(str) or set(str) :rtype: str """ if tree.tag == 'record': root = Element('toolbox_data') root.append(tree) tree = root if tree.tag != 'toolbox_data': raise ValueError("not a toolbox_data element structure") if encoding is None and unicode_fields is not None: raise ValueError("if encoding is not specified then neither should unicode_fields") l = [] for rec in tree: l.append('\n') for field in rec: mkr = field.tag value = field.text if encoding is not None: if unicode_fields is not None and mkr in unicode_fields: cur_encoding = 'utf8' else: cur_encoding = encoding if re.search(_is_value, value): l.append((u("\\%s %s\n") % (mkr, value)).encode(cur_encoding, errors)) else: l.append((u("\\%s%s\n") % (mkr, value)).encode(cur_encoding, errors)) else: if re.search(_is_value, value): l.append("\\%s %s\n" % (mkr, value)) else: l.append("\\%s%s\n" % (mkr, value)) return ''.join(l[1:]) class ToolboxSettings(StandardFormat): """This class is the base class for settings files.""" def __init__(self): super(ToolboxSettings, self).__init__() def parse(self, encoding=None, errors='strict', **kwargs): """ Return the contents of toolbox settings file with a nested structure. :param encoding: encoding used by settings file :type encoding: str :param errors: Error handling scheme for codec. Same as ``decode()`` builtin method. :type errors: str :param kwargs: Keyword arguments passed to ``StandardFormat.fields()`` :type kwargs: dict :rtype: ElementTree._ElementInterface """ builder = TreeBuilder() for mkr, value in self.fields(encoding=encoding, errors=errors, **kwargs): # Check whether the first char of the field marker # indicates a block start (+) or end (-) block=mkr[0] if block in ("+", "-"): mkr=mkr[1:] else: block=None # Build tree on the basis of block char if block == "+": builder.start(mkr, {}) builder.data(value) elif block == '-': builder.end(mkr) else: builder.start(mkr, {}) builder.data(value) builder.end(mkr) return builder.close() def to_settings_string(tree, encoding=None, errors='strict', unicode_fields=None): # write XML to file l = list() _to_settings_string(tree.getroot(), l, encoding=encoding, errors=errors, unicode_fields=unicode_fields) return ''.join(l) def _to_settings_string(node, l, **kwargs): # write XML to file tag = node.tag text = node.text if len(node) == 0: if text: l.append('\\%s %s\n' % (tag, text)) else: l.append('\\%s\n' % tag) else: if text: l.append('\\+%s %s\n' % (tag, text)) else: l.append('\\+%s\n' % tag) for n in node: _to_settings_string(n, l, **kwargs) l.append('\\-%s\n' % tag) return def remove_blanks(elem): """ Remove all elements and subelements with no text and no child elements. :param elem: toolbox data in an elementtree structure :type elem: ElementTree._ElementInterface """ out = list() for child in elem: remove_blanks(child) if child.text or len(child) > 0: out.append(child) elem[:] = out def add_default_fields(elem, default_fields): """ Add blank elements and subelements specified in default_fields. :param elem: toolbox data in an elementtree structure :type elem: ElementTree._ElementInterface :param default_fields: fields to add to each type of element and subelement :type default_fields: dict(tuple) """ for field in default_fields.get(elem.tag, []): if elem.find(field) is None: SubElement(elem, field) for child in elem: add_default_fields(child, default_fields) def sort_fields(elem, field_orders): """ Sort the elements and subelements in order specified in field_orders. 
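For instance, a hypothetical ordering such as
``field_orders = {'record': ('lx', 'ps', 'ge', 'gp')}`` rearranges the
children of every ``record`` element into lx, ps, ge, gp order.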
:param elem: toolbox data in an elementtree structure :type elem: ElementTree._ElementInterface :param field_orders: order of fields for each type of element and subelement :type field_orders: dict(tuple) """ order_dicts = dict() for field, order in field_orders.items(): order_dicts[field] = order_key = dict() for i, subfield in enumerate(order): order_key[subfield] = i _sort_fields(elem, order_dicts) def _sort_fields(elem, orders_dicts): """sort the children of elem""" try: order = orders_dicts[elem.tag] except KeyError: pass else: tmp = sorted([((order.get(child.tag, 1e9), i), child) for i, child in enumerate(elem)]) elem[:] = [child for key, child in tmp] for child in elem: if len(child): _sort_fields(child, orders_dicts) def add_blank_lines(tree, blanks_before, blanks_between): """ Add blank lines before all elements and subelements specified in blank_before. :param elem: toolbox data in an elementtree structure :type elem: ElementTree._ElementInterface :param blank_before: elements and subelements to add blank lines before :type blank_before: dict(tuple) """ try: before = blanks_before[tree.tag] between = blanks_between[tree.tag] except KeyError: for elem in tree: if len(elem): add_blank_lines(elem, blanks_before, blanks_between) else: last_elem = None for elem in tree: tag = elem.tag if last_elem is not None and last_elem.tag != tag: if tag in before and last_elem is not None: e = last_elem.getiterator()[-1] e.text = (e.text or "") + "\n" else: if tag in between: e = last_elem.getiterator()[-1] e.text = (e.text or "") + "\n" if len(elem): add_blank_lines(elem, blanks_before, blanks_between) last_elem = elem def demo(): from itertools import islice # zip_path = find('corpora/toolbox.zip') # lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse() file_path = find('corpora/toolbox/rotokas.dic') lexicon = ToolboxData(file_path).parse() print('first field in fourth record:') print(lexicon[3][0].tag) print(lexicon[3][0].text) print('\nfields in sequential order:') for field in islice(lexicon.find('record'), 10): print(field.tag, field.text) print('\nlx fields:') for field in islice(lexicon.findall('record/lx'), 10): print(field.text) settings = ToolboxSettings() file_path = find('corpora/toolbox/MDF/MDF_AltH.typ') settings.open(file_path) # settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ')) tree = settings.parse(unwrap=False, encoding='cp1252') print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text) settings_tree = ElementTree(tree) print(to_settings_string(settings_tree).encode('utf8')) if __name__ == '__main__': demo() nltk-3.1/nltk/translate/0000755000076500000240000000000012610001541015007 5ustar sbstaff00000000000000nltk-3.1/nltk/translate/__init__.py0000644000076500000240000000150112607224144017131 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Machine Translation # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird , Tah Wei Hoon # URL: # For license information, see LICENSE.TXT """ Experimental features for machine translation. These interfaces are prone to change. 
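For example, a word-alignment model can be trained directly from a list of
``AlignedSent`` pairs, e.g. ``IBMModel1(bitext, 5)``; see ``nltk.translate.ibm1``
for a worked example.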
""" from nltk.translate.api import AlignedSent, Alignment, PhraseTable from nltk.translate.ibm_model import IBMModel from nltk.translate.ibm1 import IBMModel1 from nltk.translate.ibm2 import IBMModel2 from nltk.translate.ibm3 import IBMModel3 from nltk.translate.ibm4 import IBMModel4 from nltk.translate.ibm5 import IBMModel5 from nltk.translate.bleu_score import bleu from nltk.translate.metrics import alignment_error_rate from nltk.translate.stack_decoder import StackDecoder nltk-3.1/nltk/translate/api.py0000644000076500000240000002513212607224144016151 0ustar sbstaff00000000000000# Natural Language Toolkit: API for alignment and translation objects # # Copyright (C) 2001-2015 NLTK Project # Author: Will Zhang # Guan Gui # Steven Bird # Tah Wei Hoon # URL: # For license information, see LICENSE.TXT from __future__ import print_function, unicode_literals import subprocess from collections import namedtuple from nltk.compat import python_2_unicode_compatible, string_types @python_2_unicode_compatible class AlignedSent(object): """ Return an aligned sentence object, which encapsulates two sentences along with an ``Alignment`` between them. >>> from nltk.translate import AlignedSent, Alignment >>> algnsent = AlignedSent(['klein', 'ist', 'das', 'Haus'], ... ['the', 'house', 'is', 'small'], Alignment.fromstring('0-2 1-3 2-1 3-0')) >>> algnsent.words ['klein', 'ist', 'das', 'Haus'] >>> algnsent.mots ['the', 'house', 'is', 'small'] >>> algnsent.alignment Alignment([(0, 2), (1, 3), (2, 1), (3, 0)]) >>> from nltk.corpus import comtrans >>> print(comtrans.aligned_sents()[54]) 'So why should EU arm...'> >>> print(comtrans.aligned_sents()[54].alignment) 0-0 0-1 1-0 2-2 3-4 3-5 4-7 5-8 6-3 7-9 8-9 9-10 9-11 10-12 11-6 12-6 13-13 :param words: source language words :type words: list(str) :param mots: target language words :type mots: list(str) :param alignment: the word-level alignments between the source and target language :type alignment: Alignment """ def __init__(self, words, mots, alignment=None): self._words = words self._mots = mots if alignment is None: self.alignment = Alignment([]) else: assert type(alignment) is Alignment self.alignment = alignment @property def words(self): return self._words @property def mots(self): return self._mots def _get_alignment(self): return self._alignment def _set_alignment(self, alignment): _check_alignment(len(self.words), len(self.mots), alignment) self._alignment = alignment alignment = property(_get_alignment, _set_alignment) def __repr__(self): """ Return a string representation for this ``AlignedSent``. 
:rtype: str """ words = "[%s]" % (", ".join("'%s'" % w for w in self._words)) mots = "[%s]" % (", ".join("'%s'" % w for w in self._mots)) return "AlignedSent(%s, %s, %r)" % (words, mots, self._alignment) def _to_dot(self): """ Dot representation of the aligned sentence """ s = 'graph align {\n' s += 'node[shape=plaintext]\n' # Declare node for w in self._words: s += '"%s_source" [label="%s"] \n' % (w, w) for w in self._mots: s += '"%s_target" [label="%s"] \n' % (w, w) # Alignment for u,v in self._alignment: s += '"%s_source" -- "%s_target" \n' % (self._words[u] , self._mots[v] ) # Connect the source words for i in range(len(self._words)-1) : s += '"%s_source" -- "%s_source" [style=invis]\n' % (self._words[i] , self._words[i+1]) # Connect the target words for i in range(len(self._mots)-1) : s += '"%s_target" -- "%s_target" [style=invis]\n' % (self._mots[i] , self._mots[i+1]) # Put it in the same rank s += '{rank = same; %s}\n' % (' '.join('"%s_source"' % w for w in self._words)) s += '{rank = same; %s}\n' % (' '.join('"%s_target"' % w for w in self._mots)) s += '}' return s def _repr_svg_(self): """ Ipython magic : show SVG representation of this ``AlignedSent``. """ dot_string = self._to_dot().encode('utf8') output_format = 'svg' try: process = subprocess.Popen(['dot', '-T%s' % output_format], stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) except OSError: raise Exception('Cannot find the dot binary from Graphviz package') out, err = process.communicate(dot_string) return out def __str__(self): """ Return a human-readable string representation for this ``AlignedSent``. :rtype: str """ source = " ".join(self._words)[:20] + "..." target = " ".join(self._mots)[:20] + "..." return " '%s'>" % (source, target) def invert(self): """ Return the aligned sentence pair, reversing the directionality :rtype: AlignedSent """ return AlignedSent(self._mots, self._words, self._alignment.invert()) @python_2_unicode_compatible class Alignment(frozenset): """ A storage class for representing alignment between two sequences, s1, s2. In general, an alignment is a set of tuples of the form (i, j, ...) representing an alignment between the i-th element of s1 and the j-th element of s2. Tuples are extensible (they might contain additional data, such as a boolean to indicate sure vs possible alignments). >>> from nltk.translate import Alignment >>> a = Alignment([(0, 0), (0, 1), (1, 2), (2, 2)]) >>> a.invert() Alignment([(0, 0), (1, 0), (2, 1), (2, 2)]) >>> print(a.invert()) 0-0 1-0 2-1 2-2 >>> a[0] [(0, 1), (0, 0)] >>> a.invert()[2] [(2, 1), (2, 2)] >>> b = Alignment([(0, 0), (0, 1)]) >>> b.issubset(a) True >>> c = Alignment.fromstring('0-0 0-1') >>> b == c True """ def __new__(cls, pairs): self = frozenset.__new__(cls, pairs) self._len = (max(p[0] for p in self) if self != frozenset([]) else 0) self._index = None return self @classmethod def fromstring(cls, s): """ Read a giza-formatted string and return an Alignment object. >>> Alignment.fromstring('0-0 2-1 9-2 21-3 10-4 7-5') Alignment([(0, 0), (2, 1), (7, 5), (9, 2), (10, 4), (21, 3)]) :type s: str :param s: the positional alignments in giza format :rtype: Alignment :return: An Alignment object corresponding to the string representation ``s``. """ return Alignment([_giza2pair(a) for a in s.split()]) def __getitem__(self, key): """ Look up the alignments that map from a given index or slice. 
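For example, looking up the pairs whose first index is 1:

    >>> from nltk.translate import Alignment
    >>> Alignment([(0, 0), (1, 2)])[1]
    [(1, 2)]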
""" if not self._index: self._build_index() return self._index.__getitem__(key) def invert(self): """ Return an Alignment object, being the inverted mapping. """ return Alignment(((p[1], p[0]) + p[2:]) for p in self) def range(self, positions=None): """ Work out the range of the mapping from the given positions. If no positions are specified, compute the range of the entire mapping. """ image = set() if not self._index: self._build_index() if not positions: positions = list(range(len(self._index))) for p in positions: image.update(f for _,f in self._index[p]) return sorted(image) def __repr__(self): """ Produce a Giza-formatted string representing the alignment. """ return "Alignment(%r)" % sorted(self) def __str__(self): """ Produce a Giza-formatted string representing the alignment. """ return " ".join("%d-%d" % p[:2] for p in sorted(self)) def _build_index(self): """ Build a list self._index such that self._index[i] is a list of the alignments originating from word i. """ self._index = [[] for _ in range(self._len + 1)] for p in self: self._index[p[0]].append(p) def _giza2pair(pair_string): i, j = pair_string.split("-") return int(i), int(j) def _naacl2pair(pair_string): i, j, p = pair_string.split("-") return int(i), int(j) def _check_alignment(num_words, num_mots, alignment): """ Check whether the alignments are legal. :param num_words: the number of source language words :type num_words: int :param num_mots: the number of target language words :type num_mots: int :param alignment: alignment to be checked :type alignment: Alignment :raise IndexError: if alignment falls outside the sentence """ assert type(alignment) is Alignment if not all(0 <= pair[0] < num_words for pair in alignment): raise IndexError("Alignment is outside boundary of words") if not all(pair[1] is None or 0 <= pair[1] < num_mots for pair in alignment): raise IndexError("Alignment is outside boundary of mots") PhraseTableEntry = namedtuple('PhraseTableEntry', ['trg_phrase', 'log_prob']) class PhraseTable(object): """ In-memory store of translations for a given phrase, and the log probability of the those translations """ def __init__(self): self.src_phrases = dict() def translations_for(self, src_phrase): """ Get the translations for a source language phrase :param src_phrase: Source language phrase of interest :type src_phrase: tuple(str) :return: A list of target language phrases that are translations of ``src_phrase``, ordered in decreasing order of likelihood. Each list element is a tuple of the target phrase and its log probability. 
:rtype: list(PhraseTableEntry) """ return self.src_phrases[src_phrase] def add(self, src_phrase, trg_phrase, log_prob): """ :type src_phrase: tuple(str) :type trg_phrase: tuple(str) :param log_prob: Log probability that given ``src_phrase``, ``trg_phrase`` is its translation :type log_prob: float """ entry = PhraseTableEntry(trg_phrase=trg_phrase, log_prob=log_prob) if src_phrase not in self.src_phrases: self.src_phrases[src_phrase] = [] self.src_phrases[src_phrase].append(entry) self.src_phrases[src_phrase].sort(key=lambda e: e.log_prob, reverse=True) def __contains__(self, src_phrase): return src_phrase in self.src_phrases nltk-3.1/nltk/translate/bleu_score.py0000644000076500000240000002273112607224144017524 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: BLEU Score # # Copyright (C) 2001-2015 NLTK Project # Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim # Contributors: Dmitrijs Milajevs # URL: # For license information, see LICENSE.TXT """BLEU score implementation.""" from __future__ import division import math from nltk.tokenize import word_tokenize from nltk.compat import Counter from nltk.util import ngrams def bleu(references, hypothesis, weights): """ Calculate BLEU score (Bilingual Evaluation Understudy) from Papineni, Kishore, Salim Roukos, Todd Ward, and Wei-Jing Zhu. 2002. "BLEU: a method for automatic evaluation of machine translation." In Proceedings of ACL. http://www.aclweb.org/anthology/P02-1040.pdf >>> weights = [0.25, 0.25, 0.25, 0.25] >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', ... 'ensures', 'that', 'the', 'military', 'always', ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', ... 'forever', 'hearing', 'the', 'activity', 'guidebook', ... 'that', 'party', 'direct'] >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ... 'ensures', 'that', 'the', 'military', 'will', 'forever', ... 'heed', 'Party', 'commands'] >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', ... 'guarantees', 'the', 'military', 'forces', 'always', ... 'being', 'under', 'the', 'command', 'of', 'the', ... 'Party'] >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ... 'army', 'always', 'to', 'heed', 'the', 'directions', ... 'of', 'the', 'party'] >>> bleu([reference1, reference2, reference3], hypothesis1, weights) 0.5045666840058485 >>> bleu([reference1, reference2, reference3], hypothesis2, weights) 0 :param references: reference sentences :type references: list(list(str)) :param hypothesis: a hypothesis sentence :type hypothesis: list(str) :param weights: weights for unigrams, bigrams, trigrams and so on :type weights: list(float) """ p_ns = ( _modified_precision(references, hypothesis, i) for i, _ in enumerate(weights, start=1) ) try: s = math.fsum(w * math.log(p_n) for w, p_n in zip(weights, p_ns)) except ValueError: # some p_ns is 0 return 0 bp = _brevity_penalty(references, hypothesis) return bp * math.exp(s) def _modified_precision(references, hypothesis, n): """ Calculate modified ngram precision. The normal precision method may lead to some wrong translations with high-precision, e.g., the translation, in which a word of reference repeats several times, has very high precision. The famous "the the the ... " example shows that you can get BLEU precision by duplicating high frequency words. 
>>> reference1 = 'the cat is on the mat'.split() >>> reference2 = 'there is a cat on the mat'.split() >>> hypothesis1 = 'the the the the the the the'.split() >>> references = [reference1, reference2] >>> _modified_precision(references, hypothesis1, n=1) 0.2857142857142857 In the modified n-gram precision, a reference word will be considered exhausted after a matching hypothesis word is identified, e.g. >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ... 'ensures', 'that', 'the', 'military', 'will', ... 'forever', 'heed', 'Party', 'commands'] >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', ... 'guarantees', 'the', 'military', 'forces', 'always', ... 'being', 'under', 'the', 'command', 'of', 'the', ... 'Party'] >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ... 'army', 'always', 'to', 'heed', 'the', 'directions', ... 'of', 'the', 'party'] >>> hypothesis = 'of the'.split() >>> references = [reference1, reference2, reference3] >>> _modified_precision(references, hypothesis, n=1) 1.0 >>> _modified_precision(references, hypothesis, n=2) 1.0 An example of a normal machine translation hypothesis: >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', ... 'ensures', 'that', 'the', 'military', 'always', ... 'obeys', 'the', 'commands', 'of', 'the', 'party'] >>> hypothesis2 = ['It', 'is', 'to', 'insure', 'the', 'troops', ... 'forever', 'hearing', 'the', 'activity', 'guidebook', ... 'that', 'party', 'direct'] >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', ... 'ensures', 'that', 'the', 'military', 'will', ... 'forever', 'heed', 'Party', 'commands'] >>> reference2 = ['It', 'is', 'the', 'guiding', 'principle', 'which', ... 'guarantees', 'the', 'military', 'forces', 'always', ... 'being', 'under', 'the', 'command', 'of', 'the', ... 'Party'] >>> reference3 = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the', ... 'army', 'always', 'to', 'heed', 'the', 'directions', ... 'of', 'the', 'party'] >>> references = [reference1, reference2, reference3] >>> _modified_precision(references, hypothesis1, n=1) 0.9444444444444444 >>> _modified_precision(references, hypothesis2, n=1) 0.5714285714285714 >>> _modified_precision(references, hypothesis1, n=2) 0.5882352941176471 >>> _modified_precision(references, hypothesis2, n=2) 0.07692307692307693 :param references: A list of reference translations. :type references: list(list(str)) :param hypothesis: A hypothesis translation. :type hypothesis: list(str) :param n: The ngram order. :type n: int """ counts = Counter(ngrams(hypothesis, n)) if not counts: return 0 max_counts = {} for reference in references: reference_counts = Counter(ngrams(reference, n)) for ngram in counts: max_counts[ngram] = max(max_counts.get(ngram, 0), reference_counts[ngram]) clipped_counts = dict((ngram, min(count, max_counts[ngram])) for ngram, count in counts.items()) return sum(clipped_counts.values()) / sum(counts.values()) def _brevity_penalty(references, hypothesis): """ Calculate brevity penalty. As the modified n-gram precision still has the problem from the short length sentence, brevity penalty is used to modify the overall BLEU score according to length. An example from the paper. There are three references with length 12, 15 and 17. And a concise hypothesis of the length 12. The brevity penalty is 1. >>> reference1 = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 >>> reference2 = list('aaaaaaaaaaaaaaa') # i.e. ['a'] * 15 >>> reference3 = list('aaaaaaaaaaaaaaaaa') # i.e. 
['a'] * 17 >>> hypothesis = list('aaaaaaaaaaaa') # i.e. ['a'] * 12 >>> references = [reference1, reference2, reference3] >>> _brevity_penalty(references, hypothesis) 1.0 In case a hypothesis translation is shorter than the references, penalty is applied. >>> references = [['a'] * 28, ['a'] * 28] >>> hypothesis = ['a'] * 12 >>> _brevity_penalty(references, hypothesis) 0.2635971381157267 The length of the closest reference is used to compute the penalty. If the length of a hypothesis is 12, and the reference lengths are 13 and 2, the penalty is applied because the hypothesis length (12) is less then the closest reference length (13). >>> references = [['a'] * 13, ['a'] * 2] >>> hypothesis = ['a'] * 12 >>> _brevity_penalty(references, hypothesis) 0.9200444146293233 The brevity penalty doesn't depend on reference order. More importantly, when two reference sentences are at the same distance, the shortest reference sentence length is used. >>> references = [['a'] * 13, ['a'] * 11] >>> hypothesis = ['a'] * 12 >>> bp1 = _brevity_penalty(references, hypothesis) >>> bp2 = _brevity_penalty(reversed(references),hypothesis) >>> bp1 == bp2 == 1 True A test example from mteval-v13a.pl (starting from the line 705): >>> references = [['a'] * 11, ['a'] * 8] >>> hypothesis = ['a'] * 7 >>> _brevity_penalty(references, hypothesis) 0.8668778997501817 >>> references = [['a'] * 11, ['a'] * 8, ['a'] * 6, ['a'] * 7] >>> hypothesis = ['a'] * 7 >>> _brevity_penalty(references, hypothesis) 1.0 :param references: A list of reference translations. :type references: list(list(str)) :param hypothesis: A hypothesis translation. :type hypothesis: list(str) """ c = len(hypothesis) ref_lens = (len(reference) for reference in references) r = min(ref_lens, key=lambda ref_len: (abs(ref_len - c), ref_len)) if c > r: return 1 else: return math.exp(1 - r / c) nltk-3.1/nltk/translate/gale_church.py0000644000076500000240000001663412607224144017653 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Gale-Church Aligner # # Copyright (C) 2001-2013 NLTK Project # Author: Torsten Marek # URL: # For license information, see LICENSE.TXT """ A port of the Gale-Church Aligner. Gale & Church (1993), A Program for Aligning Sentences in Bilingual Corpora. http://aclweb.org/anthology/J93-1004.pdf """ from __future__ import division import math try: from scipy.stats import norm from norm import logsf as norm_logsf except ImportError: def erfcc(x): """Complementary error function.""" z = abs(x) t = 1 / (1 + 0.5 * z) r = t * math.exp(-z * z - 1.26551223 + t * (1.00002368 + t * (.37409196 + t * (.09678418 + t * (-.18628806 + t * (.27886807 + t * (-1.13520398 + t * (1.48851587 + t * (-.82215223 + t * .17087277))))))))) if x >= 0.: return r else: return 2. 
- r def norm_cdf(x): """Return the area under the normal distribution from M{-∞..x}.""" return 1 - 0.5 * erfcc(x / math.sqrt(2)) def norm_logsf(x): try: return math.log(1 - norm_cdf(x)) except ValueError: return float('-inf') LOG2 = math.log(2) class LanguageIndependent(object): # These are the language-independent probabilities and parameters # given in Gale & Church # for the computation, l_1 is always the language with less characters PRIORS = { (1, 0): 0.0099, (0, 1): 0.0099, (1, 1): 0.89, (2, 1): 0.089, (1, 2): 0.089, (2, 2): 0.011, } AVERAGE_CHARACTERS = 1 VARIANCE_CHARACTERS = 6.8 def trace(backlinks, source, target): links = [] pos = (len(source), len(target)) while pos != (0, 0): s, t = backlinks[pos] for i in range(s): for j in range(t): links.append((pos[0] - i - 1, pos[1] - j - 1)) pos = (pos[0] - s, pos[1] - t) return links[::-1] def align_log_prob(i, j, source_sents, target_sents, alignment, params): """Returns the log probability of the two sentences C{source_sents[i]}, C{target_sents[j]} being aligned with a specific C{alignment}. @param i: The offset of the source sentence. @param j: The offset of the target sentence. @param source_sents: The list of source sentence lengths. @param target_sents: The list of target sentence lengths. @param alignment: The alignment type, a tuple of two integers. @param params: The sentence alignment parameters. @returns: The log probability of a specific alignment between the two sentences, given the parameters. """ l_s = sum(source_sents[i - offset - 1] for offset in range(alignment[0])) l_t = sum(target_sents[j - offset - 1] for offset in range(alignment[1])) try: # actually, the paper says l_s * params.VARIANCE_CHARACTERS, this is based on the C # reference implementation. With l_s in the denominator, insertions are impossible. m = (l_s + l_t / params.AVERAGE_CHARACTERS) / 2 delta = (l_s * params.AVERAGE_CHARACTERS - l_t) / math.sqrt(m * params.VARIANCE_CHARACTERS) except ZeroDivisionError: return float('-inf') return - (LOG2 + norm_logsf(abs(delta)) + math.log(params.PRIORS[alignment])) def align_blocks(source_sents, target_sents, params = LanguageIndependent): """Return the sentence alignment of two text blocks (usually paragraphs). >>> align_blocks([5,5,5], [7,7,7]) [(0, 0), (1, 1), (2, 2)] >>> align_blocks([10,5,5], [12,20]) [(0, 0), (1, 1), (2, 1)] >>> align_blocks([12,20], [10,5,5]) [(0, 0), (1, 1), (1, 2)] >>> align_blocks([10,2,10,10,2,10], [12,3,20,3,12]) [(0, 0), (1, 1), (2, 2), (3, 2), (4, 3), (5, 4)] @param source_sents: The list of source sentence lengths. @param target_sents: The list of target sentence lengths. @param params: the sentence alignment parameters. @return: The sentence alignments, a list of index pairs. """ alignment_types = list(params.PRIORS.keys()) # there are always three rows in the history (with the last of them being filled) D = [[]] backlinks = {} for i in range(len(source_sents) + 1): for j in range(len(target_sents) + 1): min_dist = float('inf') min_align = None for a in alignment_types: prev_i = - 1 - a[0] prev_j = j - a[1] if prev_i < -len(D) or prev_j < 0: continue p = D[prev_i][prev_j] + align_log_prob(i, j, source_sents, target_sents, a, params) if p < min_dist: min_dist = p min_align = a if min_dist == float('inf'): min_dist = 0 backlinks[(i, j)] = min_align D[-1].append(min_dist) if len(D) > 2: D.pop(0) D.append([]) return trace(backlinks, source_sents, target_sents) def align_texts(source_blocks, target_blocks, params = LanguageIndependent): """Creates the sentence alignment of two texts. 
Texts can consist of several blocks. Block boundaries cannot be crossed by sentence alignment links. Each block consists of a list that contains the lengths (in characters) of the sentences in this block. @param source_blocks: The list of blocks in the source text. @param target_blocks: The list of blocks in the target text. @param params: the sentence alignment parameters. @returns: A list of sentence alignment lists """ if len(source_blocks) != len(target_blocks): raise ValueError("Source and target texts do not have the same number of blocks.") return [align_blocks(source_block, target_block, params) for source_block, target_block in zip(source_blocks, target_blocks)] # File I/O functions; may belong in a corpus reader def split_at(it, split_value): """Splits an iterator C{it} at values of C{split_value}. Each instance of C{split_value} is swallowed. The iterator produces subiterators which need to be consumed fully before the next subiterator can be used. """ def _chunk_iterator(first): v = first while v != split_value: yield v v = it.next() while True: yield _chunk_iterator(it.next()) def parse_token_stream(stream, soft_delimiter, hard_delimiter): """Parses a stream of tokens and splits it into sentences (using C{soft_delimiter} tokens) and blocks (using C{hard_delimiter} tokens) for use with the L{align_texts} function. """ return [ [sum(len(token) for token in sentence_it) for sentence_it in split_at(block_it, soft_delimiter)] for block_it in split_at(stream, hard_delimiter)] # Code for test files in nltk_contrib/align/data/*.tok # import sys # from contextlib import nested # with nested(open(sys.argv[1], "r"), open(sys.argv[2], "r")) as (s, t): # source = parse_token_stream((l.strip() for l in s), ".EOS", ".EOP") # target = parse_token_stream((l.strip() for l in t), ".EOS", ".EOP") # print align_texts(source, target) nltk-3.1/nltk/translate/gdfa.py0000644000076500000240000001340512607224144016301 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: GDFA word alignment symmetrization # # Copyright (C) 2001-2015 NLTK Project # Authors: Liling Tan # URL: # For license information, see LICENSE.TXT import codecs from collections import defaultdict def grow_diag_final_and(srclen, trglen, e2f, f2e): """ This module symmetrisatizes the source-to-target and target-to-source word alignment output and produces, aka. GDFA algorithm (Koehn, 2005). Step 1: Find the intersection of the bidirectional alignment. Step 2: Search for additional neighbor alignment points to be added, given these criteria: (i) neighbor alignments points are not in the intersection and (ii) neighbor alignments are in the union. Step 3: Add all other alignment points thats not in the intersection, not in the neighboring alignments that met the criteria but in the original foward/backward alignment outputs. >>> forw = ('0-0 2-1 9-2 21-3 10-4 7-5 11-6 9-7 12-8 1-9 3-10 ' ... '4-11 17-12 17-13 25-14 13-15 24-16 11-17 28-18') >>> back = ('0-0 1-9 2-9 3-10 4-11 5-12 6-6 7-5 8-6 9-7 10-4 ' ... '11-6 12-8 13-12 15-12 17-13 18-13 19-12 20-13 ' ... '21-3 22-12 23-14 24-17 25-15 26-17 27-18 28-18') >>> srctext = ("ã“㮠よㆠ㪠ãƒãƒ­ãƒ¼ 白色 ã‚ㄠ星 㮠L 関数 " ... "㯠L 㨠共 ã« ä¸é€£ç¶š 㫠増加 ã™ã‚‹ ã“㨠㌠" ... "期待 㕠れる ã“㨠を 示㗠㟠。") >>> trgtext = ("Therefore , we expect that the luminosity function " ... "of such halo white dwarfs increases discontinuously " ... 
"with the luminosity .") >>> srclen = len(srctext.split()) >>> trglen = len(trgtext.split()) >>> >>> gdfa = grow_diag_final_and(srclen, trglen, forw, back) >>> gdfa == set([(28, 18), (6, 6), (24, 17), (2, 1), (15, 12), (13, 12), ... (2, 9), (3, 10), (26, 17), (25, 15), (8, 6), (9, 7), (20, ... 13), (18, 13), (0, 0), (10, 4), (13, 15), (23, 14), (7, 5), ... (25, 14), (1, 9), (17, 13), (4, 11), (11, 17), (9, 2), (22, ... 12), (27, 18), (24, 16), (21, 3), (19, 12), (17, 12), (5, ... 12), (11, 6), (12, 8)]) True References: Koehn, P., A. Axelrod, A. Birch, C. Callison, M. Osborne, and D. Talbot. 2005. Edinburgh System Description for the 2005 IWSLT Speech Translation Evaluation. In MT Eval Workshop. :type srclen: int :param srclen: the number of tokens in the source language :type trglen: int :param trglen: the number of tokens in the target language :type e2f: str :param e2f: the forward word alignment outputs from source-to-target language (in pharaoh output format) :type f2e: str :param f2e: the backward word alignment outputs from target-to-source language (in pharaoh output format) :rtype: set(tuple(int)) :return: the symmetrized alignment points from the GDFA algorithm """ # Converts pharaoh text format into list of tuples. e2f = [tuple(map(int,a.split('-'))) for a in e2f.split()] f2e = [tuple(map(int,a.split('-'))) for a in f2e.split()] neighbors = [(-1,0),(0,-1),(1,0),(0,1),(-1,-1),(-1,1),(1,-1),(1,1)] alignment = set(e2f).intersection(set(f2e)) # Find the intersection. union = set(e2f).union(set(f2e)) # *aligned* is used to check if neighbors are aligned in grow_diag() aligned = defaultdict(set) for i,j in alignment: aligned['e'].add(i) aligned['j'].add(j) def grow_diag(): """ Search for the neighbor points and them to the intersected alignment points if criteria are met. """ prev_len = len(alignment) - 1 # iterate until no new points added while prev_len < len(alignment): # for english word e = 0 ... en for e in range(srclen): # for foreign word f = 0 ... fn for f in range(trglen): # if ( e aligned with f) if (e,f) in alignment: # for each neighboring point (e-new, f-new) for neighbor in neighbors: neighbor = tuple(i+j for i,j in zip((e,f),neighbor)) e_new, f_new = neighbor # if ( ( e-new not aligned and f-new not aligned) # and (e-new, f-new in union(e2f, f2e) ) if (e_new not in aligned and f_new not in aligned)\ and neighbor in union: alignment.add(neighbor) aligned['e'].add(e_new); aligned['f'].add(f_new) prev_len+=1 def final_and(a): """ Adds remaining points that are not in the intersection, not in the neighboring alignments but in the original *e2f* and *f2e* alignments """ # for english word e = 0 ... en for e_new in range(srclen): # for foreign word f = 0 ... fn for f_new in range(trglen): # if ( ( e-new not aligned and f-new not aligned) # and (e-new, f-new in union(e2f, f2e) ) if (e_new not in aligned and f_new not in aligned and (e_new, f_new) in a): alignment.add((e_new, f_new)) aligned['e'].add(e_new); aligned['f'].add(f_new) grow_diag() final_and(e2f) final_and(f2e) return alignment nltk-3.1/nltk/translate/ibm1.py0000644000076500000240000002147612607224144016237 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: IBM Model 1 # # Copyright (C) 2001-2013 NLTK Project # Author: Chin Yee Lee # Hengfeng Li # Ruxin Hou # Calvin Tanujaya Lim # Based on earlier version by: # Will Zhang # Guan Gui # URL: # For license information, see LICENSE.TXT """ Lexical translation model that ignores word order. 
In IBM Model 1, word order is ignored for simplicity. Thus, the following two alignments are equally likely. Source: je mange du jambon Target: i eat some ham Alignment: (1,1) (2,2) (3,3) (4,4) Source: je mange du jambon Target: some ham eat i Alignment: (1,4) (2,3) (3,2) (4,1) The EM algorithm used in Model 1 is: E step - In the training data, count how many times a source language word is translated into a target language word, weighted by the prior probability of the translation. M step - Estimate the new probability of translation based on the counts from the Expectation step. Notations: i: Position in the source sentence Valid values are 0 (for NULL), 1, 2, ..., length of source sentence j: Position in the target sentence Valid values are 1, 2, ..., length of target sentence s: A word in the source language t: A word in the target language References: Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and Robert L. Mercer. 1993. The Mathematics of Statistical Machine Translation: Parameter Estimation. Computational Linguistics, 19 (2), 263-311. """ from __future__ import division from collections import defaultdict from nltk.translate import AlignedSent from nltk.translate import Alignment from nltk.translate import IBMModel from nltk.translate.ibm_model import Counts import warnings class IBMModel1(IBMModel): """ Lexical translation model that ignores word order >>> bitext = [] >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big'])) >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) >>> ibm1 = IBMModel1(bitext, 5) >>> print(ibm1.translation_table['buch']['book']) 0.889... >>> print(ibm1.translation_table['das']['book']) 0.061... >>> print(ibm1.translation_table['buch'][None]) 0.113... >>> print(ibm1.translation_table['ja'][None]) 0.072... >>> test_sentence = bitext[2] >>> test_sentence.words ['das', 'buch', 'ist', 'ja', 'klein'] >>> test_sentence.mots ['the', 'book', 'is', 'small'] >>> test_sentence.alignment Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)]) """ def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None): """ Train on ``sentence_aligned_corpus`` and create a lexical translation model. Translation direction is from ``AlignedSent.mots`` to ``AlignedSent.words``. :param sentence_aligned_corpus: Sentence-aligned parallel corpus :type sentence_aligned_corpus: list(AlignedSent) :param iterations: Number of iterations to run training algorithm :type iterations: int :param probability_tables: Optional. Use this to pass in custom probability values. If not specified, probabilities will be set to a uniform distribution, or some other sensible value. If specified, the following entry must be present: ``translation_table``. See ``IBMModel`` for the type and purpose of this table. 
:type probability_tables: dict[str]: object """ super(IBMModel1, self).__init__(sentence_aligned_corpus) if probability_tables is None: self.set_uniform_probabilities(sentence_aligned_corpus) else: # Set user-defined probabilities self.translation_table = probability_tables['translation_table'] for n in range(0, iterations): self.train(sentence_aligned_corpus) self.__align_all(sentence_aligned_corpus) def set_uniform_probabilities(self, sentence_aligned_corpus): initial_prob = 1 / len(self.trg_vocab) if initial_prob < IBMModel.MIN_PROB: warnings.warn("Target language vocabulary is too large (" + str(len(self.trg_vocab)) + " words). " "Results may be less accurate.") for t in self.trg_vocab: self.translation_table[t] = defaultdict(lambda: initial_prob) def train(self, parallel_corpus): counts = Counts() for aligned_sentence in parallel_corpus: trg_sentence = aligned_sentence.words src_sentence = [None] + aligned_sentence.mots # E step (a): Compute normalization factors to weigh counts total_count = self.prob_all_alignments(src_sentence, trg_sentence) # E step (b): Collect counts for t in trg_sentence: for s in src_sentence: count = self.prob_alignment_point(s, t) normalized_count = count / total_count[t] counts.t_given_s[t][s] += normalized_count counts.any_t_given_s[s] += normalized_count # M step: Update probabilities with maximum likelihood estimate self.maximize_lexical_translation_probabilities(counts) def prob_all_alignments(self, src_sentence, trg_sentence): """ Computes the probability of all possible word alignments, expressed as a marginal distribution over target words t Each entry in the return value represents the contribution to the total alignment probability by the target word t. To obtain probability(alignment | src_sentence, trg_sentence), simply sum the entries in the return value. :return: Probability of t for all s in ``src_sentence`` :rtype: dict(str): float """ alignment_prob_for_t = defaultdict(lambda: 0.0) for t in trg_sentence: for s in src_sentence: alignment_prob_for_t[t] += self.prob_alignment_point(s, t) return alignment_prob_for_t def prob_alignment_point(self, s, t): """ Probability that word ``t`` in the target sentence is aligned to word ``s`` in the source sentence """ return self.translation_table[t][s] def prob_t_a_given_s(self, alignment_info): """ Probability of target sentence and an alignment given the source sentence """ prob = 1.0 for j, i in enumerate(alignment_info.alignment): if j == 0: continue # skip the dummy zeroeth element trg_word = alignment_info.trg_sentence[j] src_word = alignment_info.src_sentence[i] prob *= self.translation_table[trg_word][src_word] return max(prob, IBMModel.MIN_PROB) def __align_all(self, parallel_corpus): for sentence_pair in parallel_corpus: self.__align(sentence_pair) def __align(self, sentence_pair): """ Determines the best word alignment for one sentence pair from the corpus that the model was trained on. The best alignment will be set in ``sentence_pair`` when the method returns. In contrast with the internal implementation of IBM models, the word indices in the ``Alignment`` are zero- indexed, not one-indexed. 
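The decision rule here is a straightforward argmax over the lexical translation table: for the target word at position j, the chosen source position is the i that maximizes t(t_j | s_i), falling back to alignment with NULL when no source word beats t(t_j | NULL). In case of a tie, the later source word is preferred.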
:param sentence_pair: A sentence in the source language and its counterpart sentence in the target language :type sentence_pair: AlignedSent """ best_alignment = [] for j, trg_word in enumerate(sentence_pair.words): # Initialize trg_word to align with the NULL token best_prob = max(self.translation_table[trg_word][None], IBMModel.MIN_PROB) best_alignment_point = None for i, src_word in enumerate(sentence_pair.mots): align_prob = self.translation_table[trg_word][src_word] if align_prob >= best_prob: # prefer newer word in case of tie best_prob = align_prob best_alignment_point = i best_alignment.append((j, best_alignment_point)) sentence_pair.alignment = Alignment(best_alignment) nltk-3.1/nltk/translate/ibm2.py0000644000076500000240000002776412607224144016246 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: IBM Model 2 # # Copyright (C) 2001-2013 NLTK Project # Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim # URL: # For license information, see LICENSE.TXT """ Lexical translation model that considers word order. IBM Model 2 improves on Model 1 by accounting for word order. An alignment probability is introduced, a(i | j,l,m), which predicts a source word position, given its aligned target word's position. The EM algorithm used in Model 2 is: E step - In the training data, collect counts, weighted by prior probabilities. (a) count how many times a source language word is translated into a target language word (b) count how many times a particular position in the source sentence is aligned to a particular position in the target sentence M step - Estimate new probabilities based on the counts from the E step Notations: i: Position in the source sentence Valid values are 0 (for NULL), 1, 2, ..., length of source sentence j: Position in the target sentence Valid values are 1, 2, ..., length of target sentence l: Number of words in the source sentence, excluding NULL m: Number of words in the target sentence s: A word in the source language t: A word in the target language References: Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and Robert L. Mercer. 1993. The Mathematics of Statistical Machine Translation: Parameter Estimation. Computational Linguistics, 19 (2), 263-311. 
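As a minimal, self-contained sketch of how Model 2 scores one aligned target word (hypothetical toy values, not a trained model), the lexical and positional terms are simply multiplied:

    >>> t_table = {'haus': {'house': 0.9}}     # t(target | source)
    >>> a_table = {2: {1: {4: {4: 0.8}}}}      # a(i=2 | j=1, l=4, m=4)
    >>> round(t_table['haus']['house'] * a_table[2][1][4][4], 2)
    0.72

``prob_t_a_given_s`` below multiplies such terms over all target positions.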
""" from __future__ import division from collections import defaultdict from nltk.translate import AlignedSent from nltk.translate import Alignment from nltk.translate import IBMModel from nltk.translate import IBMModel1 from nltk.translate.ibm_model import Counts import warnings class IBMModel2(IBMModel): """ Lexical translation model that considers word order >>> bitext = [] >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus', 'ist', 'ja', 'groß'], ['the', 'house', 'is', 'big'])) >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) >>> ibm2 = IBMModel2(bitext, 5) >>> print(round(ibm2.translation_table['buch']['book'], 3)) 1.0 >>> print(round(ibm2.translation_table['das']['book'], 3)) 0.0 >>> print(round(ibm2.translation_table['buch'][None], 3)) 0.0 >>> print(round(ibm2.translation_table['ja'][None], 3)) 0.0 >>> print(ibm2.alignment_table[1][1][2][2]) 0.938... >>> print(round(ibm2.alignment_table[1][2][2][2], 3)) 0.0 >>> print(round(ibm2.alignment_table[2][2][4][5], 3)) 1.0 >>> test_sentence = bitext[2] >>> test_sentence.words ['das', 'buch', 'ist', 'ja', 'klein'] >>> test_sentence.mots ['the', 'book', 'is', 'small'] >>> test_sentence.alignment Alignment([(0, 0), (1, 1), (2, 2), (3, 2), (4, 3)]) """ def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None): """ Train on ``sentence_aligned_corpus`` and create a lexical translation model and an alignment model. Translation direction is from ``AlignedSent.mots`` to ``AlignedSent.words``. :param sentence_aligned_corpus: Sentence-aligned parallel corpus :type sentence_aligned_corpus: list(AlignedSent) :param iterations: Number of iterations to run training algorithm :type iterations: int :param probability_tables: Optional. Use this to pass in custom probability values. If not specified, probabilities will be set to a uniform distribution, or some other sensible value. If specified, all the following entries must be present: ``translation_table``, ``alignment_table``. See ``IBMModel`` for the type and purpose of these tables. :type probability_tables: dict[str]: object """ super(IBMModel2, self).__init__(sentence_aligned_corpus) if probability_tables is None: # Get translation probabilities from IBM Model 1 # Run more iterations of training for Model 1, since it is # faster than Model 2 ibm1 = IBMModel1(sentence_aligned_corpus, 2 * iterations) self.translation_table = ibm1.translation_table self.set_uniform_probabilities(sentence_aligned_corpus) else: # Set user-defined probabilities self.translation_table = probability_tables['translation_table'] self.alignment_table = probability_tables['alignment_table'] for n in range(0, iterations): self.train(sentence_aligned_corpus) self.__align_all(sentence_aligned_corpus) def set_uniform_probabilities(self, sentence_aligned_corpus): # a(i | j,l,m) = 1 / (l+1) for all i, j, l, m l_m_combinations = set() for aligned_sentence in sentence_aligned_corpus: l = len(aligned_sentence.mots) m = len(aligned_sentence.words) if (l, m) not in l_m_combinations: l_m_combinations.add((l, m)) initial_prob = 1 / float(l + 1) if initial_prob < IBMModel.MIN_PROB: warnings.warn("A source sentence is too long (" + str(l) + " words). 
Results may be less accurate.") for i in range(0, l + 1): for j in range(1, m + 1): self.alignment_table[i][j][l][m] = initial_prob def train(self, parallel_corpus): counts = Model2Counts() for aligned_sentence in parallel_corpus: src_sentence = [None] + aligned_sentence.mots trg_sentence = ['UNUSED'] + aligned_sentence.words # 1-indexed l = len(aligned_sentence.mots) m = len(aligned_sentence.words) # E step (a): Compute normalization factors to weigh counts total_count = self.prob_all_alignments(src_sentence, trg_sentence) # E step (b): Collect counts for j in range(1, m + 1): t = trg_sentence[j] for i in range(0, l + 1): s = src_sentence[i] count = self.prob_alignment_point( i, j, src_sentence, trg_sentence) normalized_count = count / total_count[t] counts.update_lexical_translation(normalized_count, s, t) counts.update_alignment(normalized_count, i, j, l, m) # M step: Update probabilities with maximum likelihood estimates self.maximize_lexical_translation_probabilities(counts) self.maximize_alignment_probabilities(counts) def maximize_alignment_probabilities(self, counts): MIN_PROB = IBMModel.MIN_PROB for i, j_s in counts.alignment.items(): for j, src_sentence_lengths in j_s.items(): for l, trg_sentence_lengths in src_sentence_lengths.items(): for m in trg_sentence_lengths: estimate = (counts.alignment[i][j][l][m] / counts.alignment_for_any_i[j][l][m]) self.alignment_table[i][j][l][m] = max(estimate, MIN_PROB) def prob_all_alignments(self, src_sentence, trg_sentence): """ Computes the probability of all possible word alignments, expressed as a marginal distribution over target words t Each entry in the return value represents the contribution to the total alignment probability by the target word t. To obtain probability(alignment | src_sentence, trg_sentence), simply sum the entries in the return value. :return: Probability of t for all s in ``src_sentence`` :rtype: dict(str): float """ alignment_prob_for_t = defaultdict(lambda: 0.0) for j in range(1, len(trg_sentence)): t = trg_sentence[j] for i in range(0, len(src_sentence)): alignment_prob_for_t[t] += self.prob_alignment_point( i, j, src_sentence, trg_sentence) return alignment_prob_for_t def prob_alignment_point(self, i, j, src_sentence, trg_sentence): """ Probability that position j in ``trg_sentence`` is aligned to position i in the ``src_sentence`` """ l = len(src_sentence) - 1 m = len(trg_sentence) - 1 s = src_sentence[i] t = trg_sentence[j] return self.translation_table[t][s] * self.alignment_table[i][j][l][m] def prob_t_a_given_s(self, alignment_info): """ Probability of target sentence and an alignment given the source sentence """ prob = 1.0 l = len(alignment_info.src_sentence) - 1 m = len(alignment_info.trg_sentence) - 1 for j, i in enumerate(alignment_info.alignment): if j == 0: continue # skip the dummy zeroeth element trg_word = alignment_info.trg_sentence[j] src_word = alignment_info.src_sentence[i] prob *= (self.translation_table[trg_word][src_word] * self.alignment_table[i][j][l][m]) return max(prob, IBMModel.MIN_PROB) def __align_all(self, parallel_corpus): for sentence_pair in parallel_corpus: self.__align(sentence_pair) def __align(self, sentence_pair): """ Determines the best word alignment for one sentence pair from the corpus that the model was trained on. The best alignment will be set in ``sentence_pair`` when the method returns. In contrast with the internal implementation of IBM models, the word indices in the ``Alignment`` are zero- indexed, not one-indexed. 
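In contrast with Model 1, each candidate source position i is scored by the product t(t_j | s_i) * a(i | j, l, m), so word position influences the choice; alignment to NULL (source position 0) remains the fallback.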
:param sentence_pair: A sentence in the source language and its counterpart sentence in the target language :type sentence_pair: AlignedSent """ best_alignment = [] l = len(sentence_pair.mots) m = len(sentence_pair.words) for j, trg_word in enumerate(sentence_pair.words): # Initialize trg_word to align with the NULL token best_prob = (self.translation_table[trg_word][None] * self.alignment_table[0][j + 1][l][m]) best_prob = max(best_prob, IBMModel.MIN_PROB) best_alignment_point = None for i, src_word in enumerate(sentence_pair.mots): align_prob = (self.translation_table[trg_word][src_word] * self.alignment_table[i + 1][j + 1][l][m]) if align_prob >= best_prob: best_prob = align_prob best_alignment_point = i best_alignment.append((j, best_alignment_point)) sentence_pair.alignment = Alignment(best_alignment) class Model2Counts(Counts): """ Data object to store counts of various parameters during training. Includes counts for alignment. """ def __init__(self): super(Model2Counts, self).__init__() self.alignment = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict( lambda: 0.0)))) self.alignment_for_any_i = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) def update_lexical_translation(self, count, s, t): self.t_given_s[t][s] += count self.any_t_given_s[s] += count def update_alignment(self, count, i, j, l, m): self.alignment[i][j][l][m] += count self.alignment_for_any_i[j][l][m] += count nltk-3.1/nltk/translate/ibm3.py0000644000076500000240000003306312607224144016234 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: IBM Model 3 # # Copyright (C) 2001-2013 NLTK Project # Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim # URL: # For license information, see LICENSE.TXT """ Translation model that considers how a word can be aligned to multiple words in another language. IBM Model 3 improves on Model 2 by directly modeling the phenomenon where a word in one language may be translated into zero or more words in another. This is expressed by the fertility probability, n(phi | source word). If a source word translates into more than one word, it is possible to generate sentences that have the same alignment in multiple ways. This is modeled by a distortion step. The distortion probability, d(j|i,l,m), predicts a target word position, given its aligned source word's position. The distortion probability replaces the alignment probability of Model 2. The fertility probability is not applicable for NULL. Target words that align to NULL are assumed to be distributed uniformly in the target sentence. The existence of these words is modeled by p1, the probability that a target word produced by a real source word requires another target word that is produced by NULL. The EM algorithm used in Model 3 is: E step - In the training data, collect counts, weighted by prior probabilities. (a) count how many times a source language word is translated into a target language word (b) count how many times a particular position in the target sentence is aligned to a particular position in the source sentence (c) count how many times a source word is aligned to phi number of target words (d) count how many times NULL is aligned to a target word M step - Estimate new probabilities based on the counts from the E step Because there are too many possible alignments, only the most probable ones are considered. First, the best alignment is determined using prior probabilities. 
Then, a hill climbing approach is used to find other good candidates. Notations: i: Position in the source sentence Valid values are 0 (for NULL), 1, 2, ..., length of source sentence j: Position in the target sentence Valid values are 1, 2, ..., length of target sentence l: Number of words in the source sentence, excluding NULL m: Number of words in the target sentence s: A word in the source language t: A word in the target language phi: Fertility, the number of target words produced by a source word p1: Probability that a target word produced by a source word is accompanied by another target word that is aligned to NULL p0: 1 - p1 References: Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and Robert L. Mercer. 1993. The Mathematics of Statistical Machine Translation: Parameter Estimation. Computational Linguistics, 19 (2), 263-311. """ from __future__ import division from collections import defaultdict from math import factorial from nltk.translate import AlignedSent from nltk.translate import Alignment from nltk.translate import IBMModel from nltk.translate import IBMModel2 from nltk.translate.ibm_model import Counts import warnings class IBMModel3(IBMModel): """ Translation model that considers how a word can be aligned to multiple words in another language >>> bitext = [] >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big'])) >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book'])) >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize'])) >>> ibm3 = IBMModel3(bitext, 5) >>> print(round(ibm3.translation_table['buch']['book'], 3)) 1.0 >>> print(round(ibm3.translation_table['das']['book'], 3)) 0.0 >>> print(round(ibm3.translation_table['ja'][None], 3)) 1.0 >>> print(round(ibm3.distortion_table[1][1][2][2], 3)) 1.0 >>> print(round(ibm3.distortion_table[1][2][2][2], 3)) 0.0 >>> print(round(ibm3.distortion_table[2][2][4][5], 3)) 0.75 >>> print(round(ibm3.fertility_table[2]['summarize'], 3)) 1.0 >>> print(round(ibm3.fertility_table[1]['book'], 3)) 1.0 >>> print(ibm3.p1) 0.054... >>> test_sentence = bitext[2] >>> test_sentence.words ['das', 'buch', 'ist', 'ja', 'klein'] >>> test_sentence.mots ['the', 'book', 'is', 'small'] >>> test_sentence.alignment Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)]) """ def __init__(self, sentence_aligned_corpus, iterations, probability_tables=None): """ Train on ``sentence_aligned_corpus`` and create a lexical translation model, a distortion model, a fertility model, and a model for generating NULL-aligned words. Translation direction is from ``AlignedSent.mots`` to ``AlignedSent.words``. :param sentence_aligned_corpus: Sentence-aligned parallel corpus :type sentence_aligned_corpus: list(AlignedSent) :param iterations: Number of iterations to run training algorithm :type iterations: int :param probability_tables: Optional. 
Use this to pass in custom probability values. If not specified, probabilities will be set to a uniform distribution, or some other sensible value. If specified, all the following entries must be present: ``translation_table``, ``alignment_table``, ``fertility_table``, ``p1``, ``distortion_table``. See ``IBMModel`` for the type and purpose of these tables. :type probability_tables: dict[str]: object """ super(IBMModel3, self).__init__(sentence_aligned_corpus) self.reset_probabilities() if probability_tables is None: # Get translation and alignment probabilities from IBM Model 2 ibm2 = IBMModel2(sentence_aligned_corpus, iterations) self.translation_table = ibm2.translation_table self.alignment_table = ibm2.alignment_table self.set_uniform_probabilities(sentence_aligned_corpus) else: # Set user-defined probabilities self.translation_table = probability_tables['translation_table'] self.alignment_table = probability_tables['alignment_table'] self.fertility_table = probability_tables['fertility_table'] self.p1 = probability_tables['p1'] self.distortion_table = probability_tables['distortion_table'] for n in range(0, iterations): self.train(sentence_aligned_corpus) def reset_probabilities(self): super(IBMModel3, self).reset_probabilities() self.distortion_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict( lambda: self.MIN_PROB)))) """ dict[int][int][int][int]: float. Probability(j | i,l,m). Values accessed as ``distortion_table[j][i][l][m]``. """ def set_uniform_probabilities(self, sentence_aligned_corpus): # d(j | i,l,m) = 1 / m for all i, j, l, m l_m_combinations = set() for aligned_sentence in sentence_aligned_corpus: l = len(aligned_sentence.mots) m = len(aligned_sentence.words) if (l, m) not in l_m_combinations: l_m_combinations.add((l, m)) initial_prob = 1 / float(m) if initial_prob < IBMModel.MIN_PROB: warnings.warn("A target sentence is too long (" + str(m) + " words). 
Results may be less accurate.") for j in range(1, m + 1): for i in range(0, l + 1): self.distortion_table[j][i][l][m] = initial_prob # simple initialization, taken from GIZA++ self.fertility_table[0] = defaultdict(lambda: 0.2) self.fertility_table[1] = defaultdict(lambda: 0.65) self.fertility_table[2] = defaultdict(lambda: 0.1) self.fertility_table[3] = defaultdict(lambda: 0.04) MAX_FERTILITY = 10 initial_fert_prob = 0.01 / (MAX_FERTILITY - 4) for phi in range(4, MAX_FERTILITY): self.fertility_table[phi] = defaultdict(lambda: initial_fert_prob) self.p1 = 0.5 def train(self, parallel_corpus): counts = Model3Counts() for aligned_sentence in parallel_corpus: l = len(aligned_sentence.mots) m = len(aligned_sentence.words) # Sample the alignment space sampled_alignments, best_alignment = self.sample(aligned_sentence) # Record the most probable alignment aligned_sentence.alignment = Alignment( best_alignment.zero_indexed_alignment()) # E step (a): Compute normalization factors to weigh counts total_count = self.prob_of_alignments(sampled_alignments) # E step (b): Collect counts for alignment_info in sampled_alignments: count = self.prob_t_a_given_s(alignment_info) normalized_count = count / total_count for j in range(1, m + 1): counts.update_lexical_translation( normalized_count, alignment_info, j) counts.update_distortion( normalized_count, alignment_info, j, l, m) counts.update_null_generation(normalized_count, alignment_info) counts.update_fertility(normalized_count, alignment_info) # M step: Update probabilities with maximum likelihood estimates # If any probability is less than MIN_PROB, clamp it to MIN_PROB existing_alignment_table = self.alignment_table self.reset_probabilities() self.alignment_table = existing_alignment_table # don't retrain self.maximize_lexical_translation_probabilities(counts) self.maximize_distortion_probabilities(counts) self.maximize_fertility_probabilities(counts) self.maximize_null_generation_probabilities(counts) def maximize_distortion_probabilities(self, counts): MIN_PROB = IBMModel.MIN_PROB for j, i_s in counts.distortion.items(): for i, src_sentence_lengths in i_s.items(): for l, trg_sentence_lengths in src_sentence_lengths.items(): for m in trg_sentence_lengths: estimate = (counts.distortion[j][i][l][m] / counts.distortion_for_any_j[i][l][m]) self.distortion_table[j][i][l][m] = max(estimate, MIN_PROB) def prob_t_a_given_s(self, alignment_info): """ Probability of target sentence and an alignment given the source sentence """ src_sentence = alignment_info.src_sentence trg_sentence = alignment_info.trg_sentence l = len(src_sentence) - 1 # exclude NULL m = len(trg_sentence) - 1 p1 = self.p1 p0 = 1 - p1 probability = 1.0 MIN_PROB = IBMModel.MIN_PROB # Combine NULL insertion probability null_fertility = alignment_info.fertility_of_i(0) probability *= (pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)) if probability < MIN_PROB: return MIN_PROB # Compute combination (m - null_fertility) choose null_fertility for i in range(1, null_fertility + 1): probability *= (m - null_fertility - i + 1) / i if probability < MIN_PROB: return MIN_PROB # Combine fertility probabilities for i in range(1, l + 1): fertility = alignment_info.fertility_of_i(i) probability *= (factorial(fertility) * self.fertility_table[fertility][src_sentence[i]]) if probability < MIN_PROB: return MIN_PROB # Combine lexical and distortion probabilities for j in range(1, m + 1): t = trg_sentence[j] i = alignment_info.alignment[j] s = src_sentence[i] probability *= (self.translation_table[t][s] * 
self.distortion_table[j][i][l][m]) if probability < MIN_PROB: return MIN_PROB return probability class Model3Counts(Counts): """ Data object to store counts of various parameters during training. Includes counts for distortion. """ def __init__(self): super(Model3Counts, self).__init__() self.distortion = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict( lambda: 0.0)))) self.distortion_for_any_j = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) def update_distortion(self, count, alignment_info, j, l, m): i = alignment_info.alignment[j] self.distortion[j][i][l][m] += count self.distortion_for_any_j[i][l][m] += count nltk-3.1/nltk/translate/ibm4.py0000644000076500000240000004740512607224144016242 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: IBM Model 4 # # Copyright (C) 2001-2015 NLTK Project # Author: Tah Wei Hoon # URL: # For license information, see LICENSE.TXT """ Translation model that reorders output words based on their type and distance from other related words in the output sentence. IBM Model 4 improves the distortion model of Model 3, motivated by the observation that certain words tend to be re-ordered in a predictable way relative to one another. For example, in English usually has its order flipped as in French. Model 4 requires words in the source and target vocabularies to be categorized into classes. This can be linguistically driven, like parts of speech (adjective, nouns, prepositions, etc). Word classes can also be obtained by statistical methods. The original IBM Model 4 uses an information theoretic approach to group words into 50 classes for each vocabulary. Terminology: Cept: A source word with non-zero fertility i.e. aligned to one or more target words. Tablet: The set of target word(s) aligned to a cept. Head of cept: The first word of the tablet of that cept. Center of cept: The average position of the words in that cept's tablet. If the value is not an integer, the ceiling is taken. For example, for a tablet with words in positions 2, 5, 6 in the target sentence, the center of the corresponding cept is ceil((2 + 5 + 6) / 3) = 5 Displacement: For a head word, defined as (position of head word - position of previous cept's center). Can be positive or negative. For a non-head word, defined as (position of non-head word - position of previous word in the same tablet). Always positive, because successive words in a tablet are assumed to appear to the right of the previous word. In contrast to Model 3 which reorders words in a tablet independently of other words, Model 4 distinguishes between three cases. (1) Words generated by NULL are distributed uniformly. (2) For a head word t, its position is modeled by the probability d_head(displacement | word_class_s(s),word_class_t(t)), where s is the previous cept, and word_class_s and word_class_t maps s and t to a source and target language word class respectively. (3) For a non-head word t, its position is modeled by the probability d_non_head(displacement | word_class_t(t)) The EM algorithm used in Model 4 is: E step - In the training data, collect counts, weighted by prior probabilities. 
(a) count how many times a source language word is translated into a target language word (b) for a particular word class, count how many times a head word is located at a particular displacement from the previous cept's center (c) for a particular word class, count how many times a non-head word is located at a particular displacement from the previous target word (d) count how many times a source word is aligned to phi number of target words (e) count how many times NULL is aligned to a target word M step - Estimate new probabilities based on the counts from the E step Like Model 3, there are too many possible alignments to consider. Thus, a hill climbing approach is used to sample good candidates. Notations: i: Position in the source sentence Valid values are 0 (for NULL), 1, 2, ..., length of source sentence j: Position in the target sentence Valid values are 1, 2, ..., length of target sentence l: Number of words in the source sentence, excluding NULL m: Number of words in the target sentence s: A word in the source language t: A word in the target language phi: Fertility, the number of target words produced by a source word p1: Probability that a target word produced by a source word is accompanied by another target word that is aligned to NULL p0: 1 - p1 dj: Displacement, Δj References: Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and Robert L. Mercer. 1993. The Mathematics of Statistical Machine Translation: Parameter Estimation. Computational Linguistics, 19 (2), 263-311. """ from __future__ import division from collections import defaultdict from math import factorial from nltk.translate import AlignedSent from nltk.translate import Alignment from nltk.translate import IBMModel from nltk.translate import IBMModel3 from nltk.translate.ibm_model import Counts from nltk.translate.ibm_model import longest_target_sentence_length import warnings class IBMModel4(IBMModel): """ Translation model that reorders output words based on their type and their distance from other related words in the output sentence >>> bitext = [] >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big'])) >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book'])) >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize'])) >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 } >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 } >>> ibm4 = IBMModel4(bitext, 5, src_classes, trg_classes) >>> print(round(ibm4.translation_table['buch']['book'], 3)) 1.0 >>> print(round(ibm4.translation_table['das']['book'], 3)) 0.0 >>> print(round(ibm4.translation_table['ja'][None], 3)) 1.0 >>> print(round(ibm4.head_distortion_table[1][0][1], 3)) 1.0 >>> 
print(round(ibm4.head_distortion_table[2][0][1], 3)) 0.0 >>> print(round(ibm4.non_head_distortion_table[3][6], 3)) 0.5 >>> print(round(ibm4.fertility_table[2]['summarize'], 3)) 1.0 >>> print(round(ibm4.fertility_table[1]['book'], 3)) 1.0 >>> print(ibm4.p1) 0.033... >>> test_sentence = bitext[2] >>> test_sentence.words ['das', 'buch', 'ist', 'ja', 'klein'] >>> test_sentence.mots ['the', 'book', 'is', 'small'] >>> test_sentence.alignment Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)]) """ def __init__(self, sentence_aligned_corpus, iterations, source_word_classes, target_word_classes, probability_tables=None): """ Train on ``sentence_aligned_corpus`` and create a lexical translation model, distortion models, a fertility model, and a model for generating NULL-aligned words. Translation direction is from ``AlignedSent.mots`` to ``AlignedSent.words``. :param sentence_aligned_corpus: Sentence-aligned parallel corpus :type sentence_aligned_corpus: list(AlignedSent) :param iterations: Number of iterations to run training algorithm :type iterations: int :param source_word_classes: Lookup table that maps a source word to its word class, the latter represented by an integer id :type source_word_classes: dict[str]: int :param target_word_classes: Lookup table that maps a target word to its word class, the latter represented by an integer id :type target_word_classes: dict[str]: int :param probability_tables: Optional. Use this to pass in custom probability values. If not specified, probabilities will be set to a uniform distribution, or some other sensible value. If specified, all the following entries must be present: ``translation_table``, ``alignment_table``, ``fertility_table``, ``p1``, ``head_distortion_table``, ``non_head_distortion_table``. See ``IBMModel`` and ``IBMModel4`` for the type and purpose of these tables. :type probability_tables: dict[str]: object """ super(IBMModel4, self).__init__(sentence_aligned_corpus) self.reset_probabilities() self.src_classes = source_word_classes self.trg_classes = target_word_classes if probability_tables is None: # Get probabilities from IBM model 3 ibm3 = IBMModel3(sentence_aligned_corpus, iterations) self.translation_table = ibm3.translation_table self.alignment_table = ibm3.alignment_table self.fertility_table = ibm3.fertility_table self.p1 = ibm3.p1 self.set_uniform_probabilities(sentence_aligned_corpus) else: # Set user-defined probabilities self.translation_table = probability_tables['translation_table'] self.alignment_table = probability_tables['alignment_table'] self.fertility_table = probability_tables['fertility_table'] self.p1 = probability_tables['p1'] self.head_distortion_table = probability_tables[ 'head_distortion_table'] self.non_head_distortion_table = probability_tables[ 'non_head_distortion_table'] for n in range(0, iterations): self.train(sentence_aligned_corpus) def reset_probabilities(self): super(IBMModel4, self).reset_probabilities() self.head_distortion_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))) """ dict[int][int][int]: float. Probability(displacement of head word | word class of previous cept,target word class). Values accessed as ``distortion_table[dj][src_class][trg_class]``. """ self.non_head_distortion_table = defaultdict( lambda: defaultdict(lambda: self.MIN_PROB)) """ dict[int][int]: float. Probability(displacement of non-head word | target word class). Values accessed as ``distortion_table[dj][trg_class]``. 
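For example, the probability that a non-head word of target word class ``trg_class`` lands one position to the right of the previous word in its tablet is looked up as ``non_head_distortion_table[1][trg_class]``; ``dj`` is always positive for non-head words.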
""" def set_uniform_probabilities(self, sentence_aligned_corpus): """ Set distortion probabilities uniformly to 1 / cardinality of displacement values """ max_m = longest_target_sentence_length(sentence_aligned_corpus) # The maximum displacement is m-1, when a word is in the last # position m of the target sentence and the previously placed # word is in the first position. # Conversely, the minimum displacement is -(m-1). # Thus, the displacement range is (m-1) - (-(m-1)). Note that # displacement cannot be zero and is not included in the range. if max_m <= 1: initial_prob = IBMModel.MIN_PROB else: initial_prob = float(1) / (2 * (max_m - 1)) if initial_prob < IBMModel.MIN_PROB: warnings.warn("A target sentence is too long (" + str(max_m) + " words). Results may be less accurate.") for dj in range(1, max_m): self.head_distortion_table[dj] = defaultdict( lambda: defaultdict(lambda: initial_prob)) self.head_distortion_table[-dj] = defaultdict( lambda: defaultdict(lambda: initial_prob)) self.non_head_distortion_table[dj] = defaultdict( lambda: initial_prob) self.non_head_distortion_table[-dj] = defaultdict( lambda: initial_prob) def train(self, parallel_corpus): counts = Model4Counts() for aligned_sentence in parallel_corpus: m = len(aligned_sentence.words) # Sample the alignment space sampled_alignments, best_alignment = self.sample(aligned_sentence) # Record the most probable alignment aligned_sentence.alignment = Alignment( best_alignment.zero_indexed_alignment()) # E step (a): Compute normalization factors to weigh counts total_count = self.prob_of_alignments(sampled_alignments) # E step (b): Collect counts for alignment_info in sampled_alignments: count = self.prob_t_a_given_s(alignment_info) normalized_count = count / total_count for j in range(1, m + 1): counts.update_lexical_translation( normalized_count, alignment_info, j) counts.update_distortion( normalized_count, alignment_info, j, self.src_classes, self.trg_classes) counts.update_null_generation(normalized_count, alignment_info) counts.update_fertility(normalized_count, alignment_info) # M step: Update probabilities with maximum likelihood estimates # If any probability is less than MIN_PROB, clamp it to MIN_PROB existing_alignment_table = self.alignment_table self.reset_probabilities() self.alignment_table = existing_alignment_table # don't retrain self.maximize_lexical_translation_probabilities(counts) self.maximize_distortion_probabilities(counts) self.maximize_fertility_probabilities(counts) self.maximize_null_generation_probabilities(counts) def maximize_distortion_probabilities(self, counts): head_d_table = self.head_distortion_table for dj, src_classes in counts.head_distortion.items(): for s_cls, trg_classes in src_classes.items(): for t_cls in trg_classes: estimate = (counts.head_distortion[dj][s_cls][t_cls] / counts.head_distortion_for_any_dj[s_cls][t_cls]) head_d_table[dj][s_cls][t_cls] = max(estimate, IBMModel.MIN_PROB) non_head_d_table = self.non_head_distortion_table for dj, trg_classes in counts.non_head_distortion.items(): for t_cls in trg_classes: estimate = (counts.non_head_distortion[dj][t_cls] / counts.non_head_distortion_for_any_dj[t_cls]) non_head_d_table[dj][t_cls] = max(estimate, IBMModel.MIN_PROB) def prob_t_a_given_s(self, alignment_info): """ Probability of target sentence and an alignment given the source sentence """ return IBMModel4.model4_prob_t_a_given_s(alignment_info, self) @staticmethod # exposed for Model 5 to use def model4_prob_t_a_given_s(alignment_info, ibm_model): probability = 1.0 MIN_PROB 
= IBMModel.MIN_PROB def null_generation_term(): # Binomial distribution: B(m - null_fertility, p1) value = 1.0 p1 = ibm_model.p1 p0 = 1 - p1 null_fertility = alignment_info.fertility_of_i(0) m = len(alignment_info.trg_sentence) - 1 value *= (pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)) if value < MIN_PROB: return MIN_PROB # Combination: (m - null_fertility) choose null_fertility for i in range(1, null_fertility + 1): value *= (m - null_fertility - i + 1) / i return value def fertility_term(): value = 1.0 src_sentence = alignment_info.src_sentence for i in range(1, len(src_sentence)): fertility = alignment_info.fertility_of_i(i) value *= (factorial(fertility) * ibm_model.fertility_table[fertility][src_sentence[i]]) if value < MIN_PROB: return MIN_PROB return value def lexical_translation_term(j): t = alignment_info.trg_sentence[j] i = alignment_info.alignment[j] s = alignment_info.src_sentence[i] return ibm_model.translation_table[t][s] def distortion_term(j): t = alignment_info.trg_sentence[j] i = alignment_info.alignment[j] if i == 0: # case 1: t is aligned to NULL return 1.0 if alignment_info.is_head_word(j): # case 2: t is the first word of a tablet previous_cept = alignment_info.previous_cept(j) src_class = None if previous_cept is not None: previous_s = alignment_info.src_sentence[previous_cept] src_class = ibm_model.src_classes[previous_s] trg_class = ibm_model.trg_classes[t] dj = j - alignment_info.center_of_cept(previous_cept) return ibm_model.head_distortion_table[dj][src_class][trg_class] # case 3: t is a subsequent word of a tablet previous_position = alignment_info.previous_in_tablet(j) trg_class = ibm_model.trg_classes[t] dj = j - previous_position return ibm_model.non_head_distortion_table[dj][trg_class] # end nested functions # Abort computation whenever probability falls below MIN_PROB at # any point, since MIN_PROB can be considered as zero probability *= null_generation_term() if probability < MIN_PROB: return MIN_PROB probability *= fertility_term() if probability < MIN_PROB: return MIN_PROB for j in range(1, len(alignment_info.trg_sentence)): probability *= lexical_translation_term(j) if probability < MIN_PROB: return MIN_PROB probability *= distortion_term(j) if probability < MIN_PROB: return MIN_PROB return probability class Model4Counts(Counts): """ Data object to store counts of various parameters during training. Includes counts for distortion. 
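Counts are kept in nested ``defaultdict`` tables, so an update such as ``counts.head_distortion[dj][src_class][trg_class] += normalized_count`` needs no explicit initialization; the count tables mirror the probability tables they are used to re-estimate.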
""" def __init__(self): super(Model4Counts, self).__init__() self.head_distortion = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) self.head_distortion_for_any_dj = defaultdict( lambda: defaultdict(lambda: 0.0)) self.non_head_distortion = defaultdict( lambda: defaultdict(lambda: 0.0)) self.non_head_distortion_for_any_dj = defaultdict(lambda: 0.0) def update_distortion(self, count, alignment_info, j, src_classes, trg_classes): i = alignment_info.alignment[j] t = alignment_info.trg_sentence[j] if i == 0: # case 1: t is aligned to NULL pass elif alignment_info.is_head_word(j): # case 2: t is the first word of a tablet previous_cept = alignment_info.previous_cept(j) if previous_cept is not None: previous_src_word = alignment_info.src_sentence[previous_cept] src_class = src_classes[previous_src_word] else: src_class = None trg_class = trg_classes[t] dj = j - alignment_info.center_of_cept(previous_cept) self.head_distortion[dj][src_class][trg_class] += count self.head_distortion_for_any_dj[src_class][trg_class] += count else: # case 3: t is a subsequent word of a tablet previous_j = alignment_info.previous_in_tablet(j) trg_class = trg_classes[t] dj = j - previous_j self.non_head_distortion[dj][trg_class] += count self.non_head_distortion_for_any_dj[trg_class] += count nltk-3.1/nltk/translate/ibm5.py0000644000076500000240000006541012607224144016237 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: IBM Model 5 # # Copyright (C) 2001-2015 NLTK Project # Author: Tah Wei Hoon # URL: # For license information, see LICENSE.TXT """ Translation model that keeps track of vacant positions in the target sentence to decide where to place translated words. Translation can be viewed as a process where each word in the source sentence is stepped through sequentially, generating translated words for each source word. The target sentence can be viewed as being made up of ``m`` empty slots initially, which gradually fill up as generated words are placed in them. Models 3 and 4 use distortion probabilities to decide how to place translated words. For simplicity, these models ignore the history of which slots have already been occupied with translated words. Consider the placement of the last translated word: there is only one empty slot left in the target sentence, so the distortion probability should be 1.0 for that position and 0.0 everywhere else. However, the distortion probabilities for Models 3 and 4 are set up such that all positions are under consideration. IBM Model 5 fixes this deficiency by accounting for occupied slots during translation. It introduces the vacancy function v(j), the number of vacancies up to, and including, position j in the target sentence. Terminology: Maximum vacancy: The number of valid slots that a word can be placed in. This is not necessarily the same as the number of vacant slots. For example, if a tablet contains more than one word, the head word cannot be placed at the last vacant slot because there will be no space for the other words in the tablet. The number of valid slots has to take into account the length of the tablet. Non-head words cannot be placed before the head word, so vacancies to the left of the head word are ignored. Vacancy difference: For a head word: (v(j) - v(center of previous cept)) Can be positive or negative. For a non-head word: (v(j) - v(position of previously placed word)) Always positive, because successive words in a tablet are assumed to appear to the right of the previous word. 
Positioning of target words fall under three cases: (1) Words generated by NULL are distributed uniformly (2) For a head word t, its position is modeled by the probability v_head(dv | max_v,word_class_t(t)) (3) For a non-head word t, its position is modeled by the probability v_non_head(dv | max_v,word_class_t(t)) dv and max_v are defined differently for head and non-head words. The EM algorithm used in Model 5 is: E step - In the training data, collect counts, weighted by prior probabilities. (a) count how many times a source language word is translated into a target language word (b) for a particular word class and maximum vacancy, count how many times a head word and the previous cept's center have a particular difference in number of vacancies (b) for a particular word class and maximum vacancy, count how many times a non-head word and the previous target word have a particular difference in number of vacancies (d) count how many times a source word is aligned to phi number of target words (e) count how many times NULL is aligned to a target word M step - Estimate new probabilities based on the counts from the E step Like Model 4, there are too many possible alignments to consider. Thus, a hill climbing approach is used to sample good candidates. In addition, pruning is used to weed out unlikely alignments based on Model 4 scores. Notations: i: Position in the source sentence Valid values are 0 (for NULL), 1, 2, ..., length of source sentence j: Position in the target sentence Valid values are 1, 2, ..., length of target sentence l: Number of words in the source sentence, excluding NULL m: Number of words in the target sentence s: A word in the source language t: A word in the target language phi: Fertility, the number of target words produced by a source word p1: Probability that a target word produced by a source word is accompanied by another target word that is aligned to NULL p0: 1 - p1 max_v: Maximum vacancy dv: Vacancy difference, Δv The definition of v_head here differs from GIZA++, section 4.7 of [Brown et al., 1993], and [Koehn, 2010]. In the latter cases, v_head is v_head(v(j) | v(center of previous cept),max_v,word_class(t)). Here, we follow appendix B of [Brown et al., 1993] and combine v(j) with v(center of previous cept) to obtain dv: v_head(v(j) - v(center of previous cept) | max_v,word_class(t)). References: Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and Robert L. Mercer. 1993. The Mathematics of Statistical Machine Translation: Parameter Estimation. Computational Linguistics, 19 (2), 263-311. 
""" from __future__ import division from collections import defaultdict from math import factorial from nltk.translate import AlignedSent from nltk.translate import Alignment from nltk.translate import IBMModel from nltk.translate import IBMModel4 from nltk.translate.ibm_model import Counts from nltk.translate.ibm_model import longest_target_sentence_length import warnings class IBMModel5(IBMModel): """ Translation model that keeps track of vacant positions in the target sentence to decide where to place translated words >>> bitext = [] >>> bitext.append(AlignedSent(['klein', 'ist', 'das', 'haus'], ['the', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus', 'war', 'ja', 'groß'], ['the', 'house', 'was', 'big'])) >>> bitext.append(AlignedSent(['das', 'buch', 'ist', 'ja', 'klein'], ['the', 'book', 'is', 'small'])) >>> bitext.append(AlignedSent(['ein', 'haus', 'ist', 'klein'], ['a', 'house', 'is', 'small'])) >>> bitext.append(AlignedSent(['das', 'haus'], ['the', 'house'])) >>> bitext.append(AlignedSent(['das', 'buch'], ['the', 'book'])) >>> bitext.append(AlignedSent(['ein', 'buch'], ['a', 'book'])) >>> bitext.append(AlignedSent(['ich', 'fasse', 'das', 'buch', 'zusammen'], ['i', 'summarize', 'the', 'book'])) >>> bitext.append(AlignedSent(['fasse', 'zusammen'], ['summarize'])) >>> src_classes = {'the': 0, 'a': 0, 'small': 1, 'big': 1, 'house': 2, 'book': 2, 'is': 3, 'was': 3, 'i': 4, 'summarize': 5 } >>> trg_classes = {'das': 0, 'ein': 0, 'haus': 1, 'buch': 1, 'klein': 2, 'groß': 2, 'ist': 3, 'war': 3, 'ja': 4, 'ich': 5, 'fasse': 6, 'zusammen': 6 } >>> ibm5 = IBMModel5(bitext, 5, src_classes, trg_classes) >>> print(round(ibm5.head_vacancy_table[1][1][1], 3)) 1.0 >>> print(round(ibm5.head_vacancy_table[2][1][1], 3)) 0.0 >>> print(round(ibm5.non_head_vacancy_table[3][3][6], 3)) 1.0 >>> print(round(ibm5.fertility_table[2]['summarize'], 3)) 1.0 >>> print(round(ibm5.fertility_table[1]['book'], 3)) 1.0 >>> print(ibm5.p1) 0.033... >>> test_sentence = bitext[2] >>> test_sentence.words ['das', 'buch', 'ist', 'ja', 'klein'] >>> test_sentence.mots ['the', 'book', 'is', 'small'] >>> test_sentence.alignment Alignment([(0, 0), (1, 1), (2, 2), (3, None), (4, 3)]) """ MIN_SCORE_FACTOR = 0.2 """ Alignments with scores below this factor are pruned during sampling """ def __init__(self, sentence_aligned_corpus, iterations, source_word_classes, target_word_classes, probability_tables=None): """ Train on ``sentence_aligned_corpus`` and create a lexical translation model, vacancy models, a fertility model, and a model for generating NULL-aligned words. Translation direction is from ``AlignedSent.mots`` to ``AlignedSent.words``. :param sentence_aligned_corpus: Sentence-aligned parallel corpus :type sentence_aligned_corpus: list(AlignedSent) :param iterations: Number of iterations to run training algorithm :type iterations: int :param source_word_classes: Lookup table that maps a source word to its word class, the latter represented by an integer id :type source_word_classes: dict[str]: int :param target_word_classes: Lookup table that maps a target word to its word class, the latter represented by an integer id :type target_word_classes: dict[str]: int :param probability_tables: Optional. Use this to pass in custom probability values. If not specified, probabilities will be set to a uniform distribution, or some other sensible value. 
If specified, all the following entries must be present: ``translation_table``, ``alignment_table``, ``fertility_table``, ``p1``, ``head_distortion_table``, ``non_head_distortion_table``, ``head_vacancy_table``, ``non_head_vacancy_table``. See ``IBMModel``, ``IBMModel4``, and ``IBMModel5`` for the type and purpose of these tables. :type probability_tables: dict[str]: object """ super(IBMModel5, self).__init__(sentence_aligned_corpus) self.reset_probabilities() self.src_classes = source_word_classes self.trg_classes = target_word_classes if probability_tables is None: # Get probabilities from IBM model 4 ibm4 = IBMModel4(sentence_aligned_corpus, iterations, source_word_classes, target_word_classes) self.translation_table = ibm4.translation_table self.alignment_table = ibm4.alignment_table self.fertility_table = ibm4.fertility_table self.p1 = ibm4.p1 self.head_distortion_table = ibm4.head_distortion_table self.non_head_distortion_table = ibm4.non_head_distortion_table self.set_uniform_probabilities(sentence_aligned_corpus) else: # Set user-defined probabilities self.translation_table = probability_tables['translation_table'] self.alignment_table = probability_tables['alignment_table'] self.fertility_table = probability_tables['fertility_table'] self.p1 = probability_tables['p1'] self.head_distortion_table = probability_tables[ 'head_distortion_table'] self.non_head_distortion_table = probability_tables[ 'non_head_distortion_table'] self.head_vacancy_table = probability_tables[ 'head_vacancy_table'] self.non_head_vacancy_table = probability_tables[ 'non_head_vacancy_table'] for n in range(0, iterations): self.train(sentence_aligned_corpus) def reset_probabilities(self): super(IBMModel5, self).reset_probabilities() self.head_vacancy_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))) """ dict[int][int][int]: float. Probability(vacancy difference | number of remaining valid positions,target word class). Values accessed as ``head_vacancy_table[dv][v_max][trg_class]``. """ self.non_head_vacancy_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: self.MIN_PROB))) """ dict[int][int][int]: float. Probability(vacancy difference | number of remaining valid positions,target word class). Values accessed as ``non_head_vacancy_table[dv][v_max][trg_class]``. """ def set_uniform_probabilities(self, sentence_aligned_corpus): """ Set vacancy probabilities uniformly to 1 / cardinality of vacancy difference values """ max_m = longest_target_sentence_length(sentence_aligned_corpus) # The maximum vacancy difference occurs when a word is placed in # the last available position m of the target sentence and the # previous word position has no vacancies. # The minimum is 1-max_v, when a word is placed in the first # available position and the previous word is placed beyond the # last available position. # Thus, the number of possible vacancy difference values is # (max_v) - (1-max_v) + 1 = 2 * max_v. if max_m > 0 and (float(1) / (2 * max_m)) < IBMModel.MIN_PROB: warnings.warn("A target sentence is too long (" + str(max_m) + " words). 
Results may be less accurate.") for max_v in range(1, max_m + 1): for dv in range(1, max_m + 1): initial_prob = 1 / (2 * max_v) self.head_vacancy_table[dv][max_v] = defaultdict( lambda: initial_prob) self.head_vacancy_table[-(dv-1)][max_v] = defaultdict( lambda: initial_prob) self.non_head_vacancy_table[dv][max_v] = defaultdict( lambda: initial_prob) self.non_head_vacancy_table[-(dv-1)][max_v] = defaultdict( lambda: initial_prob) def train(self, parallel_corpus): counts = Model5Counts() for aligned_sentence in parallel_corpus: l = len(aligned_sentence.mots) m = len(aligned_sentence.words) # Sample the alignment space sampled_alignments, best_alignment = self.sample(aligned_sentence) # Record the most probable alignment aligned_sentence.alignment = Alignment( best_alignment.zero_indexed_alignment()) # E step (a): Compute normalization factors to weigh counts total_count = self.prob_of_alignments(sampled_alignments) # E step (b): Collect counts for alignment_info in sampled_alignments: count = self.prob_t_a_given_s(alignment_info) normalized_count = count / total_count for j in range(1, m + 1): counts.update_lexical_translation( normalized_count, alignment_info, j) slots = Slots(m) for i in range(1, l + 1): counts.update_vacancy( normalized_count, alignment_info, i, self.trg_classes, slots) counts.update_null_generation(normalized_count, alignment_info) counts.update_fertility(normalized_count, alignment_info) # M step: Update probabilities with maximum likelihood estimates # If any probability is less than MIN_PROB, clamp it to MIN_PROB existing_alignment_table = self.alignment_table self.reset_probabilities() self.alignment_table = existing_alignment_table # don't retrain self.maximize_lexical_translation_probabilities(counts) self.maximize_vacancy_probabilities(counts) self.maximize_fertility_probabilities(counts) self.maximize_null_generation_probabilities(counts) def sample(self, sentence_pair): """ Sample the most probable alignments from the entire alignment space according to Model 4 Note that Model 4 scoring is used instead of Model 5 because the latter is too expensive to compute. First, determine the best alignment according to IBM Model 2. With this initial alignment, use hill climbing to determine the best alignment according to a IBM Model 4. Add this alignment and its neighbors to the sample set. Repeat this process with other initial alignments obtained by pegging an alignment point. Finally, prune alignments that have substantially lower Model 4 scores than the best alignment. 
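Pruning keeps an alignment only if its Model 4 score is greater than ``MIN_SCORE_FACTOR`` (0.2 by default) times the best sampled score; see ``prune``.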
:param sentence_pair: Source and target language sentence pair to generate a sample of alignments from :type sentence_pair: AlignedSent :return: A set of best alignments represented by their ``AlignmentInfo`` and the best alignment of the set for convenience :rtype: set(AlignmentInfo), AlignmentInfo """ sampled_alignments, best_alignment = super( IBMModel5, self).sample(sentence_pair) return self.prune(sampled_alignments), best_alignment def prune(self, alignment_infos): """ Removes alignments from ``alignment_infos`` that have substantially lower Model 4 scores than the best alignment :return: Pruned alignments :rtype: set(AlignmentInfo) """ alignments = [] best_score = 0 for alignment_info in alignment_infos: score = IBMModel4.model4_prob_t_a_given_s(alignment_info, self) best_score = max(score, best_score) alignments.append((alignment_info, score)) threshold = IBMModel5.MIN_SCORE_FACTOR * best_score alignments = [a[0] for a in alignments if a[1] > threshold] return set(alignments) def hillclimb(self, alignment_info, j_pegged=None): """ Starting from the alignment in ``alignment_info``, look at neighboring alignments iteratively for the best one, according to Model 4 Note that Model 4 scoring is used instead of Model 5 because the latter is too expensive to compute. There is no guarantee that the best alignment in the alignment space will be found, because the algorithm might be stuck in a local maximum. :param j_pegged: If specified, the search will be constrained to alignments where ``j_pegged`` remains unchanged :type j_pegged: int :return: The best alignment found from hill climbing :rtype: AlignmentInfo """ alignment = alignment_info # alias with shorter name max_probability = IBMModel4.model4_prob_t_a_given_s(alignment, self) while True: old_alignment = alignment for neighbor_alignment in self.neighboring(alignment, j_pegged): neighbor_probability = IBMModel4.model4_prob_t_a_given_s( neighbor_alignment, self) if neighbor_probability > max_probability: alignment = neighbor_alignment max_probability = neighbor_probability if alignment == old_alignment: # Until there are no better alignments break alignment.score = max_probability return alignment def prob_t_a_given_s(self, alignment_info): """ Probability of target sentence and an alignment given the source sentence """ probability = 1.0 MIN_PROB = IBMModel.MIN_PROB slots = Slots(len(alignment_info.trg_sentence) - 1) def null_generation_term(): # Binomial distribution: B(m - null_fertility, p1) value = 1.0 p1 = self.p1 p0 = 1 - p1 null_fertility = alignment_info.fertility_of_i(0) m = len(alignment_info.trg_sentence) - 1 value *= (pow(p1, null_fertility) * pow(p0, m - 2 * null_fertility)) if value < MIN_PROB: return MIN_PROB # Combination: (m - null_fertility) choose null_fertility for i in range(1, null_fertility + 1): value *= (m - null_fertility - i + 1) / i return value def fertility_term(): value = 1.0 src_sentence = alignment_info.src_sentence for i in range(1, len(src_sentence)): fertility = alignment_info.fertility_of_i(i) value *= (factorial(fertility) * self.fertility_table[fertility][src_sentence[i]]) if value < MIN_PROB: return MIN_PROB return value def lexical_translation_term(j): t = alignment_info.trg_sentence[j] i = alignment_info.alignment[j] s = alignment_info.src_sentence[i] return self.translation_table[t][s] def vacancy_term(i): value = 1.0 tablet = alignment_info.cepts[i] tablet_length = len(tablet) total_vacancies = slots.vacancies_at(len(slots)) # case 1: NULL-aligned words if tablet_length == 0: return value # 
case 2: head word j = tablet[0] previous_cept = alignment_info.previous_cept(j) previous_center = alignment_info.center_of_cept(previous_cept) dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center) max_v = total_vacancies - tablet_length + 1 trg_class = self.trg_classes[alignment_info.trg_sentence[j]] value *= self.head_vacancy_table[dv][max_v][trg_class] slots.occupy(j) # mark position as occupied total_vacancies -= 1 if value < MIN_PROB: return MIN_PROB # case 3: non-head words for k in range(1, tablet_length): previous_position = tablet[k - 1] previous_vacancies = slots.vacancies_at(previous_position) j = tablet[k] dv = slots.vacancies_at(j) - previous_vacancies max_v = (total_vacancies - tablet_length + k + 1 - previous_vacancies) trg_class = self.trg_classes[alignment_info.trg_sentence[j]] value *= self.non_head_vacancy_table[dv][max_v][trg_class] slots.occupy(j) # mark position as occupied total_vacancies -= 1 if value < MIN_PROB: return MIN_PROB return value # end nested functions # Abort computation whenever probability falls below MIN_PROB at # any point, since MIN_PROB can be considered as zero probability *= null_generation_term() if probability < MIN_PROB: return MIN_PROB probability *= fertility_term() if probability < MIN_PROB: return MIN_PROB for j in range(1, len(alignment_info.trg_sentence)): probability *= lexical_translation_term(j) if probability < MIN_PROB: return MIN_PROB for i in range(1, len(alignment_info.src_sentence)): probability *= vacancy_term(i) if probability < MIN_PROB: return MIN_PROB return probability def maximize_vacancy_probabilities(self, counts): MIN_PROB = IBMModel.MIN_PROB head_vacancy_table = self.head_vacancy_table for dv, max_vs in counts.head_vacancy.items(): for max_v, trg_classes in max_vs.items(): for t_cls in trg_classes: estimate = (counts.head_vacancy[dv][max_v][t_cls] / counts.head_vacancy_for_any_dv[max_v][t_cls]) head_vacancy_table[dv][max_v][t_cls] = max(estimate, MIN_PROB) non_head_vacancy_table = self.non_head_vacancy_table for dv, max_vs in counts.non_head_vacancy.items(): for max_v, trg_classes in max_vs.items(): for t_cls in trg_classes: estimate = ( counts.non_head_vacancy[dv][max_v][t_cls] / counts.non_head_vacancy_for_any_dv[max_v][t_cls]) non_head_vacancy_table[dv][max_v][t_cls] = max(estimate, MIN_PROB) class Model5Counts(Counts): """ Data object to store counts of various parameters during training. Includes counts for vacancies. """ def __init__(self): super(Model5Counts, self).__init__() self.head_vacancy = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) self.head_vacancy_for_any_dv = defaultdict( lambda: defaultdict(lambda: 0.0)) self.non_head_vacancy = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: 0.0))) self.non_head_vacancy_for_any_dv = defaultdict( lambda: defaultdict(lambda: 0.0)) def update_vacancy(self, count, alignment_info, i, trg_classes, slots): """ :param count: Value to add to the vacancy counts :param alignment_info: Alignment under consideration :param i: Source word position under consideration :param trg_classes: Target word classes :param slots: Vacancy states of the slots in the target sentence. Output parameter that will be modified as new words are placed in the target sentence. 
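For illustration (numbers assumed): with a 5-word target sentence and no slots occupied yet, ``slots.vacancies_at(5)`` is 5. If the tablet of source position ``i`` is ``[2, 4]`` and the previous cept's center is 1, the head word at position 2 is counted under ``dv = vacancies_at(2) - vacancies_at(1) = 1`` and ``max_v = 5 - 2 + 1 = 4``; slot 2 is then marked occupied before the non-head word at position 4 is counted.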
""" tablet = alignment_info.cepts[i] tablet_length = len(tablet) total_vacancies = slots.vacancies_at(len(slots)) # case 1: NULL aligned words if tablet_length == 0: return # ignore zero fertility words # case 2: head word j = tablet[0] previous_cept = alignment_info.previous_cept(j) previous_center = alignment_info.center_of_cept(previous_cept) dv = slots.vacancies_at(j) - slots.vacancies_at(previous_center) max_v = total_vacancies - tablet_length + 1 trg_class = trg_classes[alignment_info.trg_sentence[j]] self.head_vacancy[dv][max_v][trg_class] += count self.head_vacancy_for_any_dv[max_v][trg_class] += count slots.occupy(j) # mark position as occupied total_vacancies -= 1 # case 3: non-head words for k in range(1, tablet_length): previous_position = tablet[k - 1] previous_vacancies = slots.vacancies_at(previous_position) j = tablet[k] dv = slots.vacancies_at(j) - previous_vacancies max_v = (total_vacancies - tablet_length + k + 1 - previous_vacancies) trg_class = trg_classes[alignment_info.trg_sentence[j]] self.non_head_vacancy[dv][max_v][trg_class] += count self.non_head_vacancy_for_any_dv[max_v][trg_class] += count slots.occupy(j) # mark position as occupied total_vacancies -= 1 class Slots(object): """ Represents positions in a target sentence. Used to keep track of which slot (position) is occupied. """ def __init__(self, target_sentence_length): self._slots = [False] * (target_sentence_length + 1) # 1-indexed def occupy(self, position): """ :return: Mark slot at ``position`` as occupied """ self._slots[position] = True def vacancies_at(self, position): """ :return: Number of vacant slots up to, and including, ``position`` """ vacancies = 0 for k in range(1, position + 1): if not self._slots[k]: vacancies += 1 return vacancies def __len__(self): return len(self._slots) - 1 # exclude dummy zeroeth element nltk-3.1/nltk/translate/ibm_model.py0000644000076500000240000004651312607224144017335 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: IBM Model Core # # Copyright (C) 2001-2015 NLTK Project # Author: Tah Wei Hoon # URL: # For license information, see LICENSE.TXT """ Common methods and classes for all IBM models. See ``IBMModel1``, ``IBMModel2``, ``IBMModel3``, ``IBMModel4``, and ``IBMModel5`` for specific implementations. The IBM models are a series of generative models that learn lexical translation probabilities, p(target language word|source language word), given a sentence-aligned parallel corpus. The models increase in sophistication from model 1 to 5. Typically, the output of lower models is used to seed the higher models. All models use the Expectation-Maximization (EM) algorithm to learn various probability tables. Words in a sentence are one-indexed. The first word of a sentence has position 1, not 0. Index 0 is reserved in the source sentence for the NULL token. The concept of position does not apply to NULL, but it is indexed at 0 by convention. Each target word is aligned to exactly one source word or the NULL token. References: Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. Peter E Brown, Stephen A. Della Pietra, Vincent J. Della Pietra, and Robert L. Mercer. 1993. The Mathematics of Statistical Machine Translation: Parameter Estimation. Computational Linguistics, 19 (2), 263-311. 
""" from bisect import insort_left from collections import defaultdict from copy import deepcopy from math import ceil def longest_target_sentence_length(sentence_aligned_corpus): """ :param sentence_aligned_corpus: Parallel corpus under consideration :type sentence_aligned_corpus: list(AlignedSent) :return: Number of words in the longest target language sentence of ``sentence_aligned_corpus`` """ max_m = 0 for aligned_sentence in sentence_aligned_corpus: m = len(aligned_sentence.words) max_m = max(m, max_m) return max_m class IBMModel(object): """ Abstract base class for all IBM models """ # Avoid division by zero and precision errors by imposing a minimum # value for probabilities. Note that this approach is theoretically # incorrect, since it may create probabilities that sum to more # than 1. In practice, the contribution of probabilities with MIN_PROB # is tiny enough that the value of MIN_PROB can be treated as zero. MIN_PROB = 1.0e-12 # GIZA++ is more liberal and uses 1.0e-7 def __init__(self, sentence_aligned_corpus): self.init_vocab(sentence_aligned_corpus) self.reset_probabilities() def reset_probabilities(self): self.translation_table = defaultdict( lambda: defaultdict(lambda: IBMModel.MIN_PROB)) """ dict[str][str]: float. Probability(target word | source word). Values accessed as ``translation_table[target_word][source_word]``. """ self.alignment_table = defaultdict( lambda: defaultdict(lambda: defaultdict(lambda: defaultdict( lambda: IBMModel.MIN_PROB)))) """ dict[int][int][int][int]: float. Probability(i | j,l,m). Values accessed as ``alignment_table[i][j][l][m]``. Used in model 2 and hill climbing in models 3 and above """ self.fertility_table = defaultdict( lambda: defaultdict(lambda: self.MIN_PROB)) """ dict[int][str]: float. Probability(fertility | source word). Values accessed as ``fertility_table[fertility][source_word]``. Used in model 3 and higher. """ self.p1 = 0.5 """ Probability that a generated word requires another target word that is aligned to NULL. Used in model 3 and higher. """ def set_uniform_probabilities(self, sentence_aligned_corpus): """ Initialize probability tables to a uniform distribution Derived classes should implement this accordingly. """ pass def init_vocab(self, sentence_aligned_corpus): src_vocab = set() trg_vocab = set() for aligned_sentence in sentence_aligned_corpus: trg_vocab.update(aligned_sentence.words) src_vocab.update(aligned_sentence.mots) # Add the NULL token src_vocab.add(None) self.src_vocab = src_vocab """ set(str): All source language words used in training """ self.trg_vocab = trg_vocab """ set(str): All target language words used in training """ def sample(self, sentence_pair): """ Sample the most probable alignments from the entire alignment space First, determine the best alignment according to IBM Model 2. With this initial alignment, use hill climbing to determine the best alignment according to a higher IBM Model. Add this alignment and its neighbors to the sample set. Repeat this process with other initial alignments obtained by pegging an alignment point. Hill climbing may be stuck in a local maxima, hence the pegging and trying out of different alignments. 
:param sentence_pair: Source and target language sentence pair to generate a sample of alignments from :type sentence_pair: AlignedSent :return: A set of best alignments represented by their ``AlignmentInfo`` and the best alignment of the set for convenience :rtype: set(AlignmentInfo), AlignmentInfo """ sampled_alignments = set() l = len(sentence_pair.mots) m = len(sentence_pair.words) # Start from the best model 2 alignment initial_alignment = self.best_model2_alignment(sentence_pair) potential_alignment = self.hillclimb(initial_alignment) sampled_alignments.update(self.neighboring(potential_alignment)) best_alignment = potential_alignment # Start from other model 2 alignments, # with the constraint that j is aligned (pegged) to i for j in range(1, m + 1): for i in range(0, l + 1): initial_alignment = self.best_model2_alignment( sentence_pair, j, i) potential_alignment = self.hillclimb(initial_alignment, j) neighbors = self.neighboring(potential_alignment, j) sampled_alignments.update(neighbors) if potential_alignment.score > best_alignment.score: best_alignment = potential_alignment return sampled_alignments, best_alignment def best_model2_alignment(self, sentence_pair, j_pegged=None, i_pegged=0): """ Finds the best alignment according to IBM Model 2 Used as a starting point for hill climbing in Models 3 and above, because it is easier to compute than the best alignments in higher models :param sentence_pair: Source and target language sentence pair to be word-aligned :type sentence_pair: AlignedSent :param j_pegged: If specified, the alignment point of j_pegged will be fixed to i_pegged :type j_pegged: int :param i_pegged: Alignment point to j_pegged :type i_pegged: int """ src_sentence = [None] + sentence_pair.mots trg_sentence = ['UNUSED'] + sentence_pair.words # 1-indexed l = len(src_sentence) - 1 # exclude NULL m = len(trg_sentence) - 1 alignment = [0] * (m + 1) # init all alignments to NULL cepts = [[] for i in range((l + 1))] # init all cepts to empty list for j in range(1, m + 1): if j == j_pegged: # use the pegged alignment instead of searching for best one best_i = i_pegged else: best_i = 0 max_alignment_prob = IBMModel.MIN_PROB t = trg_sentence[j] for i in range(0, l + 1): s = src_sentence[i] alignment_prob = (self.translation_table[t][s] * self.alignment_table[i][j][l][m]) if alignment_prob >= max_alignment_prob: max_alignment_prob = alignment_prob best_i = i alignment[j] = best_i cepts[best_i].append(j) return AlignmentInfo(tuple(alignment), tuple(src_sentence), tuple(trg_sentence), cepts) def hillclimb(self, alignment_info, j_pegged=None): """ Starting from the alignment in ``alignment_info``, look at neighboring alignments iteratively for the best one There is no guarantee that the best alignment in the alignment space will be found, because the algorithm might be stuck in a local maximum. 
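A sketch of how this method is typically driven from ``sample`` (assuming ``self`` is a model with trained tables and ``sentence_pair`` is an ``AlignedSent``)::

    initial = self.best_model2_alignment(sentence_pair)
    best = self.hillclimb(initial)       # climb from the Model 2 seed
    sample = self.neighboring(best)      # neighbors are added to the sample set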
:param j_pegged: If specified, the search will be constrained to alignments where ``j_pegged`` remains unchanged :type j_pegged: int :return: The best alignment found from hill climbing :rtype: AlignmentInfo """ alignment = alignment_info # alias with shorter name max_probability = self.prob_t_a_given_s(alignment) while True: old_alignment = alignment for neighbor_alignment in self.neighboring(alignment, j_pegged): neighbor_probability = self.prob_t_a_given_s(neighbor_alignment) if neighbor_probability > max_probability: alignment = neighbor_alignment max_probability = neighbor_probability if alignment == old_alignment: # Until there are no better alignments break alignment.score = max_probability return alignment def neighboring(self, alignment_info, j_pegged=None): """ Determine the neighbors of ``alignment_info``, obtained by moving or swapping one alignment point :param j_pegged: If specified, neighbors that have a different alignment point from j_pegged will not be considered :type j_pegged: int :return: A set neighboring alignments represented by their ``AlignmentInfo`` :rtype: set(AlignmentInfo) """ neighbors = set() l = len(alignment_info.src_sentence) - 1 # exclude NULL m = len(alignment_info.trg_sentence) - 1 original_alignment = alignment_info.alignment original_cepts = alignment_info.cepts for j in range(1, m + 1): if j != j_pegged: # Add alignments that differ by one alignment point for i in range(0, l + 1): new_alignment = list(original_alignment) new_cepts = deepcopy(original_cepts) old_i = original_alignment[j] # update alignment new_alignment[j] = i # update cepts insort_left(new_cepts[i], j) new_cepts[old_i].remove(j) new_alignment_info = AlignmentInfo( tuple(new_alignment), alignment_info.src_sentence, alignment_info.trg_sentence, new_cepts) neighbors.add(new_alignment_info) for j in range(1, m + 1): if j != j_pegged: # Add alignments that have two alignment points swapped for other_j in range(1, m + 1): if other_j != j_pegged and other_j != j: new_alignment = list(original_alignment) new_cepts = deepcopy(original_cepts) other_i = original_alignment[other_j] i = original_alignment[j] # update alignments new_alignment[j] = other_i new_alignment[other_j] = i # update cepts new_cepts[other_i].remove(other_j) insort_left(new_cepts[other_i], j) new_cepts[i].remove(j) insort_left(new_cepts[i], other_j) new_alignment_info = AlignmentInfo( tuple(new_alignment), alignment_info.src_sentence, alignment_info.trg_sentence, new_cepts) neighbors.add(new_alignment_info) return neighbors def maximize_lexical_translation_probabilities(self, counts): for t, src_words in counts.t_given_s.items(): for s in src_words: estimate = counts.t_given_s[t][s] / counts.any_t_given_s[s] self.translation_table[t][s] = max(estimate, IBMModel.MIN_PROB) def maximize_fertility_probabilities(self, counts): for phi, src_words in counts.fertility.items(): for s in src_words: estimate = (counts.fertility[phi][s] / counts.fertility_for_any_phi[s]) self.fertility_table[phi][s] = max(estimate, IBMModel.MIN_PROB) def maximize_null_generation_probabilities(self, counts): p1_estimate = counts.p1 / (counts.p1 + counts.p0) p1_estimate = max(p1_estimate, IBMModel.MIN_PROB) # Clip p1 if it is too large, because p0 = 1 - p1 should not be # smaller than MIN_PROB self.p1 = min(p1_estimate, 1 - IBMModel.MIN_PROB) def prob_of_alignments(self, alignments): probability = 0 for alignment_info in alignments: probability += self.prob_t_a_given_s(alignment_info) return probability def prob_t_a_given_s(self, alignment_info): """ 
Probability of target sentence and an alignment given the source sentence All required information is assumed to be in ``alignment_info`` and self. Derived classes should override this method """ return 0.0 class AlignmentInfo(object): """ Helper data object for training IBM Models 3 and up Read-only. For a source sentence and its counterpart in the target language, this class holds information about the sentence pair's alignment, cepts, and fertility. Warning: Alignments are one-indexed here, in contrast to nltk.translate.Alignment and AlignedSent, which are zero-indexed This class is not meant to be used outside of IBM models. """ def __init__(self, alignment, src_sentence, trg_sentence, cepts): if not isinstance(alignment, tuple): raise TypeError("The alignment must be a tuple because it is used " "to uniquely identify AlignmentInfo objects.") self.alignment = alignment """ tuple(int): Alignment function. ``alignment[j]`` is the position in the source sentence that is aligned to the position j in the target sentence. """ self.src_sentence = src_sentence """ tuple(str): Source sentence referred to by this object. Should include NULL token (None) in index 0. """ self.trg_sentence = trg_sentence """ tuple(str): Target sentence referred to by this object. Should have a dummy element in index 0 so that the first word starts from index 1. """ self.cepts = cepts """ list(list(int)): The positions of the target words, in ascending order, aligned to a source word position. For example, cepts[4] = (2, 3, 7) means that words in positions 2, 3 and 7 of the target sentence are aligned to the word in position 4 of the source sentence """ self.score = None """ float: Optional. Probability of alignment, as defined by the IBM model that assesses this alignment """ def fertility_of_i(self, i): """ Fertility of word in position ``i`` of the source sentence """ return len(self.cepts[i]) def is_head_word(self, j): """ :return: Whether the word in position ``j`` of the target sentence is a head word """ i = self.alignment[j] return self.cepts[i][0] == j def center_of_cept(self, i): """ :return: The ceiling of the average positions of the words in the tablet of cept ``i``, or 0 if ``i`` is None """ if i is None: return 0 average_position = float(sum(self.cepts[i])) / len(self.cepts[i]) return int(ceil(average_position)) def previous_cept(self, j): """ :return: The previous cept of ``j``, or None if ``j`` belongs to the first cept """ i = self.alignment[j] if i == 0: raise ValueError("Words aligned to NULL cannot have a previous " "cept because NULL has no position") previous_cept = i - 1 while previous_cept > 0 and self.fertility_of_i(previous_cept) == 0: previous_cept -= 1 if previous_cept <= 0: previous_cept = None return previous_cept def previous_in_tablet(self, j): """ :return: The position of the previous word that is in the same tablet as ``j``, or None if ``j`` is the first word of the tablet """ i = self.alignment[j] tablet_position = self.cepts[i].index(j) if tablet_position == 0: return None return self.cepts[i][tablet_position - 1] def zero_indexed_alignment(self): """ :return: Zero-indexed alignment, suitable for use in external ``nltk.translate`` modules like ``nltk.translate.Alignment`` :rtype: list(tuple) """ zero_indexed_alignment = [] for j in range(1, len(self.trg_sentence)): i = self.alignment[j] - 1 if i < 0: i = None # alignment to NULL token zero_indexed_alignment.append((j - 1, i)) return zero_indexed_alignment def __eq__(self, other): return self.alignment == other.alignment def 
__hash__(self): return hash(self.alignment) class Counts(object): """ Data object to store counts of various parameters during training """ def __init__(self): self.t_given_s = defaultdict(lambda: defaultdict(lambda: 0.0)) self.any_t_given_s = defaultdict(lambda: 0.0) self.p0 = 0.0 self.p1 = 0.0 self.fertility = defaultdict(lambda: defaultdict(lambda: 0.0)) self.fertility_for_any_phi = defaultdict(lambda: 0.0) def update_lexical_translation(self, count, alignment_info, j): i = alignment_info.alignment[j] t = alignment_info.trg_sentence[j] s = alignment_info.src_sentence[i] self.t_given_s[t][s] += count self.any_t_given_s[s] += count def update_null_generation(self, count, alignment_info): m = len(alignment_info.trg_sentence) - 1 fertility_of_null = alignment_info.fertility_of_i(0) self.p1 += fertility_of_null * count self.p0 += (m - 2 * fertility_of_null) * count def update_fertility(self, count, alignment_info): for i in range(0, len(alignment_info.src_sentence)): s = alignment_info.src_sentence[i] phi = alignment_info.fertility_of_i(i) self.fertility[phi][s] += count self.fertility_for_any_phi[s] += count nltk-3.1/nltk/translate/metrics.py0000644000076500000240000000267712607224144017057 0ustar sbstaff00000000000000# Natural Language Toolkit: Translation metrics # # Copyright (C) 2001-2015 NLTK Project # Author: Will Zhang # Guan Gui # Steven Bird # URL: # For license information, see LICENSE.TXT def alignment_error_rate(reference, hypothesis, possible=None): """ Return the Alignment Error Rate (AER) of an alignment with respect to a "gold standard" reference alignment. Return an error rate between 0.0 (perfect alignment) and 1.0 (no alignment). >>> from nltk.translate import Alignment >>> ref = Alignment([(0, 0), (1, 1), (2, 2)]) >>> test = Alignment([(0, 0), (1, 2), (2, 1)]) >>> alignment_error_rate(ref, test) # doctest: +ELLIPSIS 0.6666666666666667 :type reference: Alignment :param reference: A gold standard alignment (sure alignments) :type hypothesis: Alignment :param hypothesis: A hypothesis alignment (aka. candidate alignments) :type possible: Alignment or None :param possible: A gold standard reference of possible alignments (defaults to *reference* if None) :rtype: float or None """ if possible is None: possible = reference else: assert(reference.issubset(possible)) # sanity check return (1.0 - float(len(hypothesis & reference) + len(hypothesis & possible)) / float(len(hypothesis) + len(reference))) nltk-3.1/nltk/translate/phrase_based.py0000644000076500000240000001676712607224144020036 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Phrase Extraction Algorithm # # Copyright (C) 2001-2015 NLTK Project # Authors: Liling Tan, Fredrik Hedman, Petra Barancikova # URL: # For license information, see LICENSE.TXT def extract(f_start, f_end, e_start, e_end, alignment, f_aligned, srctext, trgtext, srclen, trglen, max_phrase_length): """ This function checks for alignment point consistency and extracts phrases using the chunk of consistent phrases. A phrase pair (e, f ) is consistent with an alignment A if and only if: (i) No English words in the phrase pair are aligned to words outside it. ∀e i ∈ e, (e i , f j ) ∈ A ⇒ f j ∈ f (ii) No Foreign words in the phrase pair are aligned to words outside it. ∀f j ∈ f , (e i , f j ) ∈ A ⇒ e i ∈ e (iii) The phrase pair contains at least one alignment point. ∃e i ∈ e Ì„ , f j ∈ f Ì„ s.t. 
(e i , f j ) ∈ A :type f_start: int :param f_start: Starting index of the possible foreign language phrases :type f_end: int :param f_end: Starting index of the possible foreign language phrases :type e_start: int :param e_start: Starting index of the possible source language phrases :type e_end: int :param e_end: Starting index of the possible source language phrases :type srctext: list :param srctext: The source language tokens, a list of string. :type trgtext: list :param trgtext: The target language tokens, a list of string. :type srclen: int :param srclen: The number of tokens in the source language tokens. :type trglen: int :param trglen: The number of tokens in the target language tokens. """ if f_end < 0: # 0-based indexing. return {} # Check if alignment points are consistent. for e,f in alignment: if ((f_start <= f <= f_end) and (e < e_start or e > e_end)): return {} # Add phrase pairs (incl. additional unaligned f) phrases = set() fs = f_start while True: fe = min(f_end, f_start + max_phrase_length - 1) while True: # add phrase pair ([e_start, e_end], [fs, fe]) to set E # Need to +1 in range to include the end-point. src_phrase = " ".join(srctext[e_start:e_end+1]) trg_phrase = " ".join(trgtext[fs:fe+1]) # Include more data for later ordering. phrases.add(((e_start, e_end+1), (f_start, f_end+1), src_phrase, trg_phrase)) fe += 1 if fe in f_aligned or fe == trglen: break fs -=1 if fs in f_aligned or fs < 0: break return phrases def phrase_extraction(srctext, trgtext, alignment, max_phrase_length=0): """ Phrase extraction algorithm extracts all consistent phrase pairs from a word-aligned sentence pair. The idea is to loop over all possible source language (e) phrases and find the minimal foreign phrase (f) that matches each of them. Matching is done by identifying all alignment points for the source phrase and finding the shortest foreign phrase that includes all the foreign counterparts for the source words. In short, a phrase alignment has to (a) contain all alignment points for all covered words (b) contain at least one alignment point >>> srctext = "michael assumes that he will stay in the house" >>> trgtext = "michael geht davon aus , dass er im haus bleibt" >>> alignment = [(0,0), (1,1), (1,2), (1,3), (2,5), (3,6), (4,9), ... (5,9), (6,7), (7,7), (8,8)] >>> phrases = phrase_extraction(srctext, trgtext, alignment) >>> for i in sorted(phrases): ... print(i) ... 
((0, 1), (0, 1), 'michael', 'michael') ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus') ((0, 2), (0, 4), 'michael assumes', 'michael geht davon aus ,') ((0, 3), (0, 6), 'michael assumes that', 'michael geht davon aus , dass') ((0, 4), (0, 7), 'michael assumes that he', 'michael geht davon aus , dass er') ((0, 9), (0, 10), 'michael assumes that he will stay in the house', 'michael geht davon aus , dass er im haus bleibt') ((1, 2), (1, 4), 'assumes', 'geht davon aus') ((1, 2), (1, 4), 'assumes', 'geht davon aus ,') ((1, 3), (1, 6), 'assumes that', 'geht davon aus , dass') ((1, 4), (1, 7), 'assumes that he', 'geht davon aus , dass er') ((1, 9), (1, 10), 'assumes that he will stay in the house', 'geht davon aus , dass er im haus bleibt') ((2, 3), (5, 6), 'that', ', dass') ((2, 3), (5, 6), 'that', 'dass') ((2, 4), (5, 7), 'that he', ', dass er') ((2, 4), (5, 7), 'that he', 'dass er') ((2, 9), (5, 10), 'that he will stay in the house', ', dass er im haus bleibt') ((2, 9), (5, 10), 'that he will stay in the house', 'dass er im haus bleibt') ((3, 4), (6, 7), 'he', 'er') ((3, 9), (6, 10), 'he will stay in the house', 'er im haus bleibt') ((4, 6), (9, 10), 'will stay', 'bleibt') ((4, 9), (7, 10), 'will stay in the house', 'im haus bleibt') ((6, 8), (7, 8), 'in the', 'im') ((6, 9), (7, 9), 'in the house', 'im haus') ((8, 9), (8, 9), 'house', 'haus') :type srctext: str :param srctext: The sentence string from the source language. :type trgtext: str :param trgtext: The sentence string from the target language. :type alignment: str :param alignment: The word alignment outputs as list of tuples, where the first elements of tuples are the source words' indices and second elements are the target words' indices. This is also the output format of nltk.translate.ibm1 :rtype: list(tuple) :return: A list of tuples, each element in a list is a phrase and each phrase is a tuple made up of (i) its source location, (ii) its target location, (iii) the source phrase and (iii) the target phrase. The phrase list of tuples represents all the possible phrases extracted from the word alignments. :type max_phrase_length: int :param max_phrase_length: maximal phrase length, if 0 or not specified it is set to a length of the longer sentence (srctext or trgtext). """ srctext = srctext.split() # e trgtext = trgtext.split() # f srclen = len(srctext) # len(e) trglen = len(trgtext) # len(f) # Keeps an index of which source/target words that are aligned. 
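# (illustrative) e.g. for alignment [(0, 0), (1, 1), (1, 2)], f_aligned below is
# [0, 1, 2]: target (foreign) positions 0, 1 and 2 each carry an alignment point.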
f_aligned = [j for _,j in alignment] max_phrase_length = max_phrase_length or max(srclen,trglen) # set of phrase pairs BP bp = set() for e_start in range(srclen): max_idx = min(srclen, e_start + max_phrase_length) for e_end in range(e_start, max_idx): # // find the minimally matching foreign phrase # (f start , f end ) = ( length(f), 0 ) # f_start ∈ [0, len(f) - 1]; f_end ∈ [0, len(f) - 1] f_start, f_end = trglen-1 , -1 # 0-based indexing for e,f in alignment: if e_start <= e <= e_end: f_start = min(f, f_start) f_end = max(f, f_end) # add extract (f start , f end , e start , e end ) to set BP phrases = extract(f_start, f_end, e_start, e_end, alignment, f_aligned, srctext, trgtext, srclen, trglen, max_phrase_length) if phrases: bp.update(phrases) return bp nltk-3.1/nltk/translate/stack_decoder.py0000644000076500000240000004672712607224144020207 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Stack decoder # # Copyright (C) 2001-2015 NLTK Project # Author: Tah Wei Hoon # URL: # For license information, see LICENSE.TXT """ A decoder that uses stacks to implement phrase-based translation. In phrase-based translation, the source sentence is segmented into phrases of one or more words, and translations for those phrases are used to build the target sentence. Hypothesis data structures are used to keep track of the source words translated so far and the partial output. A hypothesis can be expanded by selecting an untranslated phrase, looking up its translation in a phrase table, and appending that translation to the partial output. Translation is complete when a hypothesis covers all source words. The search space is huge because the source sentence can be segmented in different ways, the source phrases can be selected in any order, and there could be multiple translations for the same source phrase in the phrase table. To make decoding tractable, stacks are used to limit the number of candidate hypotheses by doing histogram and/or threshold pruning. Hypotheses with the same number of words translated are placed in the same stack. In histogram pruning, each stack has a size limit, and the hypothesis with the lowest score is removed when the stack is full. In threshold pruning, hypotheses that score below a certain threshold of the best hypothesis in that stack are removed. Hypothesis scoring can include various factors such as phrase translation probability, language model probability, length of translation, cost of remaining words to be translated, and so on. References: Philipp Koehn. 2010. Statistical Machine Translation. Cambridge University Press, New York. 
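For example (numbers assumed for illustration): with ``stack_size = 100``, pushing a 101st hypothesis onto a stack evicts the lowest-scoring one; with ``beam_threshold = 0.5``, any hypothesis scoring below ``best_score + log(0.5)`` is dropped, since hypothesis scores are log probabilities.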
""" import warnings from collections import defaultdict from math import log class StackDecoder(object): """ Phrase-based stack decoder for machine translation >>> from nltk.translate import PhraseTable >>> phrase_table = PhraseTable() >>> phrase_table.add(('niemand',), ('nobody',), log(0.8)) >>> phrase_table.add(('niemand',), ('no', 'one'), log(0.2)) >>> phrase_table.add(('erwartet',), ('expects',), log(0.8)) >>> phrase_table.add(('erwartet',), ('expecting',), log(0.2)) >>> phrase_table.add(('niemand', 'erwartet'), ('one', 'does', 'not', 'expect'), log(0.1)) >>> phrase_table.add(('die', 'spanische', 'inquisition'), ('the', 'spanish', 'inquisition'), log(0.8)) >>> phrase_table.add(('!',), ('!',), log(0.8)) >>> # nltk.model should be used here once it is implemented >>> from collections import defaultdict >>> language_prob = defaultdict(lambda: -999.0) >>> language_prob[('nobody',)] = log(0.5) >>> language_prob[('expects',)] = log(0.4) >>> language_prob[('the', 'spanish', 'inquisition')] = log(0.2) >>> language_prob[('!',)] = log(0.1) >>> language_model = type('',(object,),{'probability_change': lambda self, context, phrase: language_prob[phrase], 'probability': lambda self, phrase: language_prob[phrase]})() >>> stack_decoder = StackDecoder(phrase_table, language_model) >>> stack_decoder.translate(['niemand', 'erwartet', 'die', 'spanische', 'inquisition', '!']) ['nobody', 'expects', 'the', 'spanish', 'inquisition', '!'] """ def __init__(self, phrase_table, language_model): """ :param phrase_table: Table of translations for source language phrases and the log probabilities for those translations. :type phrase_table: PhraseTable :param language_model: Target language model. Must define a ``probability_change`` method that calculates the change in log probability of a sentence, if a given string is appended to it. This interface is experimental and will likely be replaced with nltk.model once it is implemented. :type language_model: object """ self.phrase_table = phrase_table self.language_model = language_model self.word_penalty = 0.0 """ float: Influences the translation length exponentially. If positive, shorter translations are preferred. If negative, longer translations are preferred. If zero, no penalty is applied. """ self.beam_threshold = 0.0 """ float: Hypotheses that score below this factor of the best hypothesis in a stack are dropped from consideration. Value between 0.0 and 1.0. """ self.stack_size = 100 """ int: Maximum number of hypotheses to consider in a stack. Higher values increase the likelihood of a good translation, but increases processing time. """ self.__distortion_factor = 0.5 self.__compute_log_distortion() @property def distortion_factor(self): """ float: Amount of reordering of source phrases. Lower values favour monotone translation, suitable when word order is similar for both source and target languages. Value between 0.0 and 1.0. Default 0.5. 
""" return self.__distortion_factor @distortion_factor.setter def distortion_factor(self, d): self.__distortion_factor = d self.__compute_log_distortion() def __compute_log_distortion(self): # cache log(distortion_factor) so we don't have to recompute it # when scoring hypotheses if self.__distortion_factor == 0.0: self.__log_distortion_factor = log(1e-9) # 1e-9 is almost zero else: self.__log_distortion_factor = log(self.__distortion_factor) def translate(self, src_sentence): """ :param src_sentence: Sentence to be translated :type src_sentence: list(str) :return: Translated sentence :rtype: list(str) """ sentence = tuple(src_sentence) # prevent accidental modification sentence_length = len(sentence) stacks = [_Stack(self.stack_size, self.beam_threshold) for _ in range(0, sentence_length + 1)] empty_hypothesis = _Hypothesis() stacks[0].push(empty_hypothesis) all_phrases = self.find_all_src_phrases(sentence) future_score_table = self.compute_future_scores(sentence) for stack in stacks: for hypothesis in stack: possible_expansions = StackDecoder.valid_phrases(all_phrases, hypothesis) for src_phrase_span in possible_expansions: src_phrase = sentence[src_phrase_span[0]:src_phrase_span[1]] for translation_option in (self.phrase_table. translations_for(src_phrase)): raw_score = self.expansion_score( hypothesis, translation_option, src_phrase_span) new_hypothesis = _Hypothesis( raw_score=raw_score, src_phrase_span=src_phrase_span, trg_phrase=translation_option.trg_phrase, previous=hypothesis ) new_hypothesis.future_score = self.future_score( new_hypothesis, future_score_table, sentence_length) total_words = new_hypothesis.total_translated_words() stacks[total_words].push(new_hypothesis) if not stacks[sentence_length]: warnings.warn('Unable to translate all words. ' 'The source sentence contains words not in ' 'the phrase table') # Instead of returning empty output, perhaps a partial # translation could be returned return [] best_hypothesis = stacks[sentence_length].best() return best_hypothesis.translation_so_far() def find_all_src_phrases(self, src_sentence): """ Finds all subsequences in src_sentence that have a phrase translation in the translation table :type src_sentence: tuple(str) :return: Subsequences that have a phrase translation, represented as a table of lists of end positions. For example, if result[2] is [5, 6, 9], then there are three phrases starting from position 2 in ``src_sentence``, ending at positions 5, 6, and 9 exclusive. The list of ending positions are in ascending order. :rtype: list(list(int)) """ sentence_length = len(src_sentence) phrase_indices = [[] for _ in src_sentence] for start in range(0, sentence_length): for end in range(start + 1, sentence_length + 1): potential_phrase = src_sentence[start:end] if potential_phrase in self.phrase_table: phrase_indices[start].append(end) return phrase_indices def compute_future_scores(self, src_sentence): """ Determines the approximate scores for translating every subsequence in ``src_sentence`` Future scores can be used a look-ahead to determine the difficulty of translating the remaining parts of a src_sentence. :type src_sentence: tuple(str) :return: Scores of subsequences referenced by their start and end positions. For example, result[2][5] is the score of the subsequence covering positions 2, 3, and 4. 
:rtype: dict(int: (dict(int): float)) """ scores = defaultdict(lambda: defaultdict(lambda: float('-inf'))) for seq_length in range(1, len(src_sentence) + 1): for start in range(0, len(src_sentence) - seq_length + 1): end = start + seq_length phrase = src_sentence[start:end] if phrase in self.phrase_table: score = self.phrase_table.translations_for( phrase)[0].log_prob # pick best (first) translation # Warning: API of language_model is subject to change score += self.language_model.probability(phrase) scores[start][end] = score # check if a better score can be obtained by combining # two child subsequences for mid in range(start + 1, end): combined_score = (scores[start][mid] + scores[mid][end]) if combined_score > scores[start][end]: scores[start][end] = combined_score return scores def future_score(self, hypothesis, future_score_table, sentence_length): """ Determines the approximate score for translating the untranslated words in ``hypothesis`` """ score = 0.0 for span in hypothesis.untranslated_spans(sentence_length): score += future_score_table[span[0]][span[1]] return score def expansion_score(self, hypothesis, translation_option, src_phrase_span): """ Calculate the score of expanding ``hypothesis`` with ``translation_option`` :param hypothesis: Hypothesis being expanded :type hypothesis: _Hypothesis :param translation_option: Information about the proposed expansion :type translation_option: PhraseTableEntry :param src_phrase_span: Word position span of the source phrase :type src_phrase_span: tuple(int, int) """ score = hypothesis.raw_score score += translation_option.log_prob # The API of language_model is subject to change; it could accept # a string, a list of words, and/or some other type score += self.language_model.probability_change( hypothesis, translation_option.trg_phrase) score += self.distortion_score(hypothesis, src_phrase_span) score -= self.word_penalty * len(translation_option.trg_phrase) return score def distortion_score(self, hypothesis, next_src_phrase_span): if not hypothesis.src_phrase_span: return 0.0 next_src_phrase_start = next_src_phrase_span[0] prev_src_phrase_end = hypothesis.src_phrase_span[1] distortion_distance = next_src_phrase_start - prev_src_phrase_end return abs(distortion_distance) * self.__log_distortion_factor @staticmethod def valid_phrases(all_phrases_from, hypothesis): """ Extract phrases from ``all_phrases_from`` that contains words that have not been translated by ``hypothesis`` :param all_phrases_from: Phrases represented by their spans, in the same format as the return value of ``find_all_src_phrases`` :type all_phrases_from: list(list(int)) :type hypothesis: _Hypothesis :return: A list of phrases, represented by their spans, that cover untranslated positions. :rtype: list(tuple(int, int)) """ untranslated_spans = hypothesis.untranslated_spans( len(all_phrases_from)) valid_phrases = [] for available_span in untranslated_spans: start = available_span[0] available_end = available_span[1] while start < available_end: for phrase_end in all_phrases_from[start]: if phrase_end > available_end: # Subsequent elements in all_phrases_from[start] # will also be > available_end, since the # elements are in ascending order break valid_phrases.append((start, phrase_end)) start += 1 return valid_phrases class _Hypothesis(object): """ Partial solution to a translation. Records the word positions of the phrase being translated, its translation, raw score, and the cost of the untranslated parts of the sentence. 
When the next phrase is selected to build upon the partial solution, a new _Hypothesis object is created, with a back pointer to the previous hypothesis. To find out which words have been translated so far, look at the ``src_phrase_span`` in the hypothesis chain. Similarly, the translation output can be found by traversing up the chain. """ def __init__(self, raw_score=0.0, src_phrase_span=(), trg_phrase=(), previous=None, future_score=0.0): """ :param raw_score: Likelihood of hypothesis so far. Higher is better. Does not account for untranslated words. :type raw_score: float :param src_phrase_span: Span of word positions covered by the source phrase in this hypothesis expansion. For example, (2, 5) means that the phrase is from the second word up to, but not including the fifth word in the source sentence. :type src_phrase_span: tuple(int) :param trg_phrase: Translation of the source phrase in this hypothesis expansion :type trg_phrase: tuple(str) :param previous: Previous hypothesis before expansion to this one :type previous: _Hypothesis :param future_score: Approximate score for translating the remaining words not covered by this hypothesis. Higher means that the remaining words are easier to translate. :type future_score: float """ self.raw_score = raw_score self.src_phrase_span = src_phrase_span self.trg_phrase = trg_phrase self.previous = previous self.future_score = future_score def score(self): """ Overall score of hypothesis after accounting for local and global features """ return self.raw_score + self.future_score def untranslated_spans(self, sentence_length): """ Starting from each untranslated word, find the longest continuous span of untranslated positions :param sentence_length: Length of source sentence being translated by the hypothesis :type sentence_length: int :rtype: list(tuple(int, int)) """ translated_positions = self.translated_positions() translated_positions.sort() translated_positions.append(sentence_length) # add sentinel position untranslated_spans = [] start = 0 # each untranslated span must end in one of the translated_positions for end in translated_positions: if start < end: untranslated_spans.append((start, end)) start = end + 1 return untranslated_spans def translated_positions(self): """ List of positions in the source sentence of words already translated. The list is not sorted. :rtype: list(int) """ translated_positions = [] current_hypothesis = self while current_hypothesis.previous is not None: translated_span = current_hypothesis.src_phrase_span translated_positions.extend(range(translated_span[0], translated_span[1])) current_hypothesis = current_hypothesis.previous return translated_positions def total_translated_words(self): return len(self.translated_positions()) def translation_so_far(self): translation = [] self.__build_translation(self, translation) return translation def __build_translation(self, hypothesis, output): if hypothesis.previous is None: return self.__build_translation(hypothesis.previous, output) output.extend(hypothesis.trg_phrase) class _Stack(object): """ Collection of _Hypothesis objects """ def __init__(self, max_size=100, beam_threshold=0.0): """ :param beam_threshold: Hypotheses that score less than this factor of the best hypothesis are discarded from the stack. Value must be between 0.0 and 1.0. 
:type beam_threshold: float """ self.max_size = max_size self.items = [] if beam_threshold == 0.0: self.__log_beam_threshold = float('-inf') else: self.__log_beam_threshold = log(beam_threshold) def push(self, hypothesis): """ Add ``hypothesis`` to the stack. Removes lowest scoring hypothesis if the stack is full. After insertion, hypotheses that score less than ``beam_threshold`` times the score of the best hypothesis are removed. """ self.items.append(hypothesis) self.items.sort(key=lambda h: h.score(), reverse=True) while len(self.items) > self.max_size: self.items.pop() self.threshold_prune() def threshold_prune(self): if not self.items: return # log(score * beam_threshold) = log(score) + log(beam_threshold) threshold = self.items[0].score() + self.__log_beam_threshold for hypothesis in reversed(self.items): if hypothesis.score() < threshold: self.items.pop() else: break def best(self): """ :return: Hypothesis with the highest score in the stack :rtype: _Hypothesis """ if self.items: return self.items[0] return None def __iter__(self): return iter(self.items) def __contains__(self, hypothesis): return hypothesis in self.items nltk-3.1/nltk/tree.py0000644000076500000240000017556712607224144014364 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Text Trees # # Copyright (C) 2001-2015 NLTK Project # Author: Edward Loper # Steven Bird # Peter Ljunglöf # Nathan Bodenstab (tree transforms) # URL: # For license information, see LICENSE.TXT """ Class for representing hierarchical language structures, such as syntax trees and morphological trees. """ from __future__ import print_function, unicode_literals # TODO: add LabelledTree (can be used for dependency trees) import re from nltk.grammar import Production, Nonterminal from nltk.probability import ProbabilisticMixIn from nltk.util import slice_bounds from nltk.compat import string_types, python_2_unicode_compatible, unicode_repr from nltk.internals import raise_unorderable_types ###################################################################### ## Trees ###################################################################### @python_2_unicode_compatible class Tree(list): """ A Tree represents a hierarchical grouping of leaves and subtrees. For example, each constituent in a syntax tree is represented by a single Tree. A tree's children are encoded as a list of leaves and subtrees, where a leaf is a basic (non-tree) value; and a subtree is a nested Tree. >>> from nltk.tree import Tree >>> print(Tree(1, [2, Tree(3, [4]), 5])) (1 2 (3 4) 5) >>> vp = Tree('VP', [Tree('V', ['saw']), ... Tree('NP', ['him'])]) >>> s = Tree('S', [Tree('NP', ['I']), vp]) >>> print(s) (S (NP I) (VP (V saw) (NP him))) >>> print(s[1]) (VP (V saw) (NP him)) >>> print(s[1,1]) (NP him) >>> t = Tree.fromstring("(S (NP I) (VP (V saw) (NP him)))") >>> s == t True >>> t[1][1].set_label('X') >>> t[1][1].label() 'X' >>> print(t) (S (NP I) (VP (V saw) (X him))) >>> t[0], t[1,1] = t[1,1], t[0] >>> print(t) (S (X him) (VP (V saw) (NP I))) The length of a tree is the number of children it has. >>> len(t) 2 The set_label() and label() methods allow individual constituents to be labeled. For example, syntax trees use this label to specify phrase tags, such as "NP" and "VP". Several Tree methods use "tree positions" to specify children or descendants of a tree. Tree positions are defined as follows: - The tree position *i* specifies a Tree's *i*\ th child. - The tree position ``()`` specifies the Tree itself. 
- If *p* is the tree position of descendant *d*, then *p+i* specifies the *i*\ th child of *d*. I.e., every tree position is either a single index *i*, specifying ``tree[i]``; or a sequence *i1, i2, ..., iN*, specifying ``tree[i1][i2]...[iN]``. Construct a new tree. This constructor can be called in one of two ways: - ``Tree(label, children)`` constructs a new tree with the specified label and list of children. - ``Tree.fromstring(s)`` constructs a new tree by parsing the string ``s``. """ def __init__(self, node, children=None): if children is None: raise TypeError("%s: Expected a node value and child list " % type(self).__name__) elif isinstance(children, string_types): raise TypeError("%s() argument 2 should be a list, not a " "string" % type(self).__name__) else: list.__init__(self, children) self._label = node #//////////////////////////////////////////////////////////// # Comparison operators #//////////////////////////////////////////////////////////// def __eq__(self, other): return (self.__class__ is other.__class__ and (self._label, list(self)) == (other._label, list(other))) def __lt__(self, other): if not isinstance(other, Tree): # raise_unorderable_types("<", self, other) # Sometimes children can be pure strings, # so we need to be able to compare with non-trees: return self.__class__.__name__ < other.__class__.__name__ elif self.__class__ is other.__class__: return (self._label, list(self)) < (other._label, list(other)) else: return self.__class__.__name__ < other.__class__.__name__ # @total_ordering doesn't work here, since the class inherits from a builtin class __ne__ = lambda self, other: not self == other __gt__ = lambda self, other: not (self < other or self == other) __le__ = lambda self, other: self < other or self == other __ge__ = lambda self, other: not self < other #//////////////////////////////////////////////////////////// # Disabled list operations #//////////////////////////////////////////////////////////// def __mul__(self, v): raise TypeError('Tree does not support multiplication') def __rmul__(self, v): raise TypeError('Tree does not support multiplication') def __add__(self, v): raise TypeError('Tree does not support addition') def __radd__(self, v): raise TypeError('Tree does not support addition') #//////////////////////////////////////////////////////////// # Indexing (with support for tree positions) #//////////////////////////////////////////////////////////// def __getitem__(self, index): if isinstance(index, (int, slice)): return list.__getitem__(self, index) elif isinstance(index, (list, tuple)): if len(index) == 0: return self elif len(index) == 1: return self[index[0]] else: return self[index[0]][index[1:]] else: raise TypeError("%s indices must be integers, not %s" % (type(self).__name__, type(index).__name__)) def __setitem__(self, index, value): if isinstance(index, (int, slice)): return list.__setitem__(self, index, value) elif isinstance(index, (list, tuple)): if len(index) == 0: raise IndexError('The tree position () may not be ' 'assigned to.') elif len(index) == 1: self[index[0]] = value else: self[index[0]][index[1:]] = value else: raise TypeError("%s indices must be integers, not %s" % (type(self).__name__, type(index).__name__)) def __delitem__(self, index): if isinstance(index, (int, slice)): return list.__delitem__(self, index) elif isinstance(index, (list, tuple)): if len(index) == 0: raise IndexError('The tree position () may not be deleted.') elif len(index) == 1: del self[index[0]] else: del self[index[0]][index[1:]] else: 
raise TypeError("%s indices must be integers, not %s" % (type(self).__name__, type(index).__name__)) #//////////////////////////////////////////////////////////// # Basic tree operations #//////////////////////////////////////////////////////////// def _get_node(self): """Outdated method to access the node value; use the label() method instead.""" raise NotImplementedError("Use label() to access a node label.") def _set_node(self, value): """Outdated method to set the node value; use the set_label() method instead.""" raise NotImplementedError("Use set_label() method to set a node label.") node = property(_get_node, _set_node) def label(self): """ Return the node label of the tree. >>> t = Tree.fromstring('(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))') >>> t.label() 'S' :return: the node label (typically a string) :rtype: any """ return self._label def set_label(self, label): """ Set the node label of the tree. >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.set_label("T") >>> print(t) (T (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat)))) :param label: the node label (typically a string) :type label: any """ self._label = label def leaves(self): """ Return the leaves of the tree. >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.leaves() ['the', 'dog', 'chased', 'the', 'cat'] :return: a list containing this tree's leaves. The order reflects the order of the leaves in the tree's hierarchical structure. :rtype: list """ leaves = [] for child in self: if isinstance(child, Tree): leaves.extend(child.leaves()) else: leaves.append(child) return leaves def flatten(self): """ Return a flat version of the tree, with all non-root non-terminals removed. >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> print(t.flatten()) (S the dog chased the cat) :return: a tree consisting of this tree's root connected directly to its leaves, omitting all intervening non-terminal nodes. :rtype: Tree """ return Tree(self.label(), self.leaves()) def height(self): """ Return the height of the tree. >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.height() 5 >>> print(t[0,0]) (D the) >>> t[0,0].height() 2 :return: The height of this tree. The height of a tree containing no children is 1; the height of a tree containing only leaves is 2; and the height of any other tree is one plus the maximum of its children's heights. :rtype: int """ max_child_height = 0 for child in self: if isinstance(child, Tree): max_child_height = max(max_child_height, child.height()) else: max_child_height = max(max_child_height, 1) return 1 + max_child_height def treepositions(self, order='preorder'): """ >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.treepositions() # doctest: +ELLIPSIS [(), (0,), (0, 0), (0, 0, 0), (0, 1), (0, 1, 0), (1,), (1, 0), (1, 0, 0), ...] >>> for pos in t.treepositions('leaves'): ... t[pos] = t[pos][::-1].upper() >>> print(t) (S (NP (D EHT) (N GOD)) (VP (V DESAHC) (NP (D EHT) (N TAC)))) :param order: One of: ``preorder``, ``postorder``, ``bothorder``, ``leaves``. 
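With ``order='leaves'``, only the positions of leaf values are returned, left to right; for the tree above this would be [(0, 0, 0), (0, 1, 0), (1, 0, 0), (1, 1, 0, 0), (1, 1, 1, 0)].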
""" positions = [] if order in ('preorder', 'bothorder'): positions.append( () ) for i, child in enumerate(self): if isinstance(child, Tree): childpos = child.treepositions(order) positions.extend((i,)+p for p in childpos) else: positions.append( (i,) ) if order in ('postorder', 'bothorder'): positions.append( () ) return positions def subtrees(self, filter=None): """ Generate all the subtrees of this tree, optionally restricted to trees matching the filter function. >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> for s in t.subtrees(lambda t: t.height() == 2): ... print(s) (D the) (N dog) (V chased) (D the) (N cat) :type filter: function :param filter: the function to filter all local trees """ if not filter or filter(self): yield self for child in self: if isinstance(child, Tree): for subtree in child.subtrees(filter): yield subtree def productions(self): """ Generate the productions that correspond to the non-terminal nodes of the tree. For each subtree of the form (P: C1 C2 ... Cn) this produces a production of the form P -> C1 C2 ... Cn. >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.productions() [S -> NP VP, NP -> D N, D -> 'the', N -> 'dog', VP -> V NP, V -> 'chased', NP -> D N, D -> 'the', N -> 'cat'] :rtype: list(Production) """ if not isinstance(self._label, string_types): raise TypeError('Productions can only be generated from trees having node labels that are strings') prods = [Production(Nonterminal(self._label), _child_names(self))] for child in self: if isinstance(child, Tree): prods += child.productions() return prods def pos(self): """ Return a sequence of pos-tagged words extracted from the tree. >>> t = Tree.fromstring("(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))") >>> t.pos() [('the', 'D'), ('dog', 'N'), ('chased', 'V'), ('the', 'D'), ('cat', 'N')] :return: a list of tuples containing leaves and pre-terminals (part-of-speech tags). The order reflects the order of the leaves in the tree's hierarchical structure. :rtype: list(tuple) """ pos = [] for child in self: if isinstance(child, Tree): pos.extend(child.pos()) else: pos.append((child, self._label)) return pos def leaf_treeposition(self, index): """ :return: The tree position of the ``index``-th leaf in this tree. I.e., if ``tp=self.leaf_treeposition(i)``, then ``self[tp]==self.leaves()[i]``. :raise IndexError: If this tree contains fewer than ``index+1`` leaves, or if ``index<0``. """ if index < 0: raise IndexError('index must be non-negative') stack = [(self, ())] while stack: value, treepos = stack.pop() if not isinstance(value, Tree): if index == 0: return treepos else: index -= 1 else: for i in range(len(value)-1, -1, -1): stack.append( (value[i], treepos+(i,)) ) raise IndexError('index must be less than or equal to len(self)') def treeposition_spanning_leaves(self, start, end): """ :return: The tree position of the lowest descendant of this tree that dominates ``self.leaves()[start:end]``. :raise ValueError: if ``end <= start`` """ if end <= start: raise ValueError('end must be greater than start') # Find the tree positions of the start & end leaves, and # take the longest common subsequence. 
start_treepos = self.leaf_treeposition(start) end_treepos = self.leaf_treeposition(end-1) # Find the first index where they mismatch: for i in range(len(start_treepos)): if i == len(end_treepos) or start_treepos[i] != end_treepos[i]: return start_treepos[:i] return start_treepos #//////////////////////////////////////////////////////////// # Transforms #//////////////////////////////////////////////////////////// def chomsky_normal_form(self, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^"): """ This method can modify a tree in three ways: 1. Convert a tree into its Chomsky Normal Form (CNF) equivalent -- Every subtree has either two non-terminals or one terminal as its children. This process requires the creation of more"artificial" non-terminal nodes. 2. Markov (vertical) smoothing of children in new artificial nodes 3. Horizontal (parent) annotation of nodes :param factor: Right or left factoring method (default = "right") :type factor: str = [left|right] :param horzMarkov: Markov order for sibling smoothing in artificial nodes (None (default) = include all siblings) :type horzMarkov: int | None :param vertMarkov: Markov order for parent smoothing (0 (default) = no vertical annotation) :type vertMarkov: int | None :param childChar: A string used in construction of the artificial nodes, separating the head of the original subtree from the child nodes that have yet to be expanded (default = "|") :type childChar: str :param parentChar: A string used to separate the node representation from its vertical annotation :type parentChar: str """ from nltk.treetransforms import chomsky_normal_form chomsky_normal_form(self, factor, horzMarkov, vertMarkov, childChar, parentChar) def un_chomsky_normal_form(self, expandUnary = True, childChar = "|", parentChar = "^", unaryChar = "+"): """ This method modifies the tree in three ways: 1. Transforms a tree in Chomsky Normal Form back to its original structure (branching greater than two) 2. Removes any parent annotation (if it exists) 3. (optional) expands unary subtrees (if previously collapsed with collapseUnary(...) ) :param expandUnary: Flag to expand unary or not (default = True) :type expandUnary: bool :param childChar: A string separating the head node from its children in an artificial node (default = "|") :type childChar: str :param parentChar: A sting separating the node label from its parent annotation (default = "^") :type parentChar: str :param unaryChar: A string joining two non-terminals in a unary production (default = "+") :type unaryChar: str """ from nltk.treetransforms import un_chomsky_normal_form un_chomsky_normal_form(self, expandUnary, childChar, parentChar, unaryChar) def collapse_unary(self, collapsePOS = False, collapseRoot = False, joinChar = "+"): """ Collapse subtrees with a single child (ie. unary productions) into a new non-terminal (Tree node) joined by 'joinChar'. This is useful when working with algorithms that do not allow unary productions, and completely removing the unary productions would require loss of useful information. The Tree is modified directly (since it is passed by reference) and no value is returned. :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie. Part-of-Speech tags) since they are always unary productions :type collapsePOS: bool :param collapseRoot: 'False' (default) will not modify the root production if it is unary. For the Penn WSJ treebank corpus, this corresponds to the TOP -> productions. 
:type collapseRoot: bool :param joinChar: A string used to connect collapsed node values (default = "+") :type joinChar: str """ from nltk.treetransforms import collapse_unary collapse_unary(self, collapsePOS, collapseRoot, joinChar) #//////////////////////////////////////////////////////////// # Convert, copy #//////////////////////////////////////////////////////////// @classmethod def convert(cls, tree): """ Convert a tree between different subtypes of Tree. ``cls`` determines which class will be used to encode the new tree. :type tree: Tree :param tree: The tree that should be converted. :return: The new Tree. """ if isinstance(tree, Tree): children = [cls.convert(child) for child in tree] return cls(tree._label, children) else: return tree def copy(self, deep=False): if not deep: return type(self)(self._label, self) else: return type(self).convert(self) def _frozen_class(self): return ImmutableTree def freeze(self, leaf_freezer=None): frozen_class = self._frozen_class() if leaf_freezer is None: newcopy = frozen_class.convert(self) else: newcopy = self.copy(deep=True) for pos in newcopy.treepositions('leaves'): newcopy[pos] = leaf_freezer(newcopy[pos]) newcopy = frozen_class.convert(newcopy) hash(newcopy) # Make sure the leaves are hashable. return newcopy #//////////////////////////////////////////////////////////// # Parsing #//////////////////////////////////////////////////////////// @classmethod def fromstring(cls, s, brackets='()', read_node=None, read_leaf=None, node_pattern=None, leaf_pattern=None, remove_empty_top_bracketing=False): """ Read a bracketed tree string and return the resulting tree. Trees are represented as nested brackettings, such as:: (S (NP (NNP John)) (VP (V runs))) :type s: str :param s: The string to read :type brackets: str (length=2) :param brackets: The bracket characters used to mark the beginning and end of trees and subtrees. :type read_node: function :type read_leaf: function :param read_node, read_leaf: If specified, these functions are applied to the substrings of ``s`` corresponding to nodes and leaves (respectively) to obtain the values for those nodes and leaves. They should have the following signature: read_node(str) -> value For example, these functions could be used to process nodes and leaves whose values should be some type other than string (such as ``FeatStruct``). Note that by default, node strings and leaf strings are delimited by whitespace and brackets; to override this default, use the ``node_pattern`` and ``leaf_pattern`` arguments. :type node_pattern: str :type leaf_pattern: str :param node_pattern, leaf_pattern: Regular expression patterns used to find node and leaf substrings in ``s``. By default, both nodes patterns are defined to match any sequence of non-whitespace non-bracket characters. :type remove_empty_top_bracketing: bool :param remove_empty_top_bracketing: If the resulting tree has an empty node label, and is length one, then return its single child instead. This is useful for treebank trees, which sometimes contain an extra level of bracketing. :return: A tree corresponding to the string representation ``s``. If this class method is called using a subclass of Tree, then it will return a tree of that type. :rtype: Tree """ if not isinstance(brackets, string_types) or len(brackets) != 2: raise TypeError('brackets must be a length-2 string') if re.search('\s', brackets): raise TypeError('whitespace brackets not allowed') # Construct a regexp that will tokenize the string. 
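# The pattern has three alternatives: an open bracket optionally
# followed by a node label, a close bracket, or a bare leaf token.
# By default, node and leaf substrings are runs of non-whitespace,
# non-bracket characters.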
open_b, close_b = brackets open_pattern, close_pattern = (re.escape(open_b), re.escape(close_b)) if node_pattern is None: node_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern) if leaf_pattern is None: leaf_pattern = '[^\s%s%s]+' % (open_pattern, close_pattern) token_re = re.compile('%s\s*(%s)?|%s|(%s)' % ( open_pattern, node_pattern, close_pattern, leaf_pattern)) # Walk through each token, updating a stack of trees. stack = [(None, [])] # list of (node, children) tuples for match in token_re.finditer(s): token = match.group() # Beginning of a tree/subtree if token[0] == open_b: if len(stack) == 1 and len(stack[0][1]) > 0: cls._parse_error(s, match, 'end-of-string') label = token[1:].lstrip() if read_node is not None: label = read_node(label) stack.append((label, [])) # End of a tree/subtree elif token == close_b: if len(stack) == 1: if len(stack[0][1]) == 0: cls._parse_error(s, match, open_b) else: cls._parse_error(s, match, 'end-of-string') label, children = stack.pop() stack[-1][1].append(cls(label, children)) # Leaf node else: if len(stack) == 1: cls._parse_error(s, match, open_b) if read_leaf is not None: token = read_leaf(token) stack[-1][1].append(token) # check that we got exactly one complete tree. if len(stack) > 1: cls._parse_error(s, 'end-of-string', close_b) elif len(stack[0][1]) == 0: cls._parse_error(s, 'end-of-string', open_b) else: assert stack[0][0] is None assert len(stack[0][1]) == 1 tree = stack[0][1][0] # If the tree has an extra level with node='', then get rid of # it. E.g.: "((S (NP ...) (VP ...)))" if remove_empty_top_bracketing and tree._label == '' and len(tree) == 1: tree = tree[0] # return the tree. return tree @classmethod def _parse_error(cls, s, match, expecting): """ Display a friendly error message when parsing a tree string fails. :param s: The string we're parsing. :param match: regexp match of the problem token. :param expecting: what we expected to see instead. """ # Construct a basic error message if match == 'end-of-string': pos, token = len(s), 'end-of-string' else: pos, token = match.start(), match.group() msg = '%s.read(): expected %r but got %r\n%sat index %d.' % ( cls.__name__, expecting, token, ' '*12, pos) # Add a display showing the error token itsels: s = s.replace('\n', ' ').replace('\t', ' ') offset = pos if len(s) > pos+10: s = s[:pos+10]+'...' if pos > 10: s = '...'+s[pos-10:] offset = 13 msg += '\n%s"%s"\n%s^' % (' '*16, s, ' '*(17+offset)) raise ValueError(msg) #//////////////////////////////////////////////////////////// # Visualization & String Representation #//////////////////////////////////////////////////////////// def draw(self): """ Open a new window containing a graphical diagram of this tree. """ from nltk.draw.tree import draw_trees draw_trees(self) def pretty_print(self, sentence=None, highlight=(), stream=None, **kwargs): """ Pretty-print this tree as ASCII or Unicode art. For explanation of the arguments, see the documentation for `nltk.treeprettyprinter.TreePrettyPrinter`. """ from nltk.treeprettyprinter import TreePrettyPrinter print(TreePrettyPrinter(self, sentence, highlight).text(**kwargs), file=stream) def __repr__(self): childstr = ", ".join(unicode_repr(c) for c in self) return '%s(%s, [%s])' % (type(self).__name__, unicode_repr(self._label), childstr) def _repr_png_(self): """ Draws and outputs in PNG for ipython. PNG is used instead of PDF, since it can be displayed in the qt console and has wider browser support. 
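Note that the image is produced by writing the tree to a temporary
PostScript file and converting it with Ghostscript, so a ``gs``
executable (``gswin32c.exe``/``gswin64c.exe`` on Windows) must be
available on the PATH.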
""" import os import base64 import subprocess import tempfile from nltk.draw.tree import tree_to_treesegment from nltk.draw.util import CanvasFrame from nltk.internals import find_binary _canvas_frame = CanvasFrame() widget = tree_to_treesegment(_canvas_frame.canvas(), self) _canvas_frame.add_widget(widget) x, y, w, h = widget.bbox() # print_to_file uses scrollregion to set the width and height of the pdf. _canvas_frame.canvas()['scrollregion'] = (0, 0, w, h) with tempfile.NamedTemporaryFile() as file: in_path = '{0:}.ps'.format(file.name) out_path = '{0:}.png'.format(file.name) _canvas_frame.print_to_file(in_path) _canvas_frame.destroy_widget(widget) subprocess.call([find_binary('gs', binary_names=['gswin32c.exe', 'gswin64c.exe'], env_vars=['PATH'], verbose=False)] + '-q -dEPSCrop -sDEVICE=png16m -r90 -dTextAlphaBits=4 -dGraphicsAlphaBits=4 -dSAFER -dBATCH -dNOPAUSE -sOutputFile={0:} {1:}' .format(out_path, in_path).split()) with open(out_path, 'rb') as sr: res = sr.read() os.remove(in_path) os.remove(out_path) return base64.b64encode(res).decode() def __str__(self): return self.pformat() def pprint(self, **kwargs): """ Print a string representation of this Tree to 'stream' """ if "stream" in kwargs: stream = kwargs["stream"] del kwargs["stream"] else: stream = None print(self.pformat(**kwargs), file=stream) def pformat(self, margin=70, indent=0, nodesep='', parens='()', quotes=False): """ :return: A pretty-printed string representation of this tree. :rtype: str :param margin: The right margin at which to do line-wrapping. :type margin: int :param indent: The indentation level at which printing begins. This number is used to decide how far to indent subsequent lines. :type indent: int :param nodesep: A string that is used to separate the node from the children. E.g., the default value ``':'`` gives trees like ``(S: (NP: I) (VP: (V: saw) (NP: it)))``. """ # Try writing it on one line. s = self._pformat_flat(nodesep, parens, quotes) if len(s) + indent < margin: return s # If it doesn't fit on one line, then write it on multi-lines. if isinstance(self._label, string_types): s = '%s%s%s' % (parens[0], self._label, nodesep) else: s = '%s%s%s' % (parens[0], unicode_repr(self._label), nodesep) for child in self: if isinstance(child, Tree): s += '\n'+' '*(indent+2)+child.pformat(margin, indent+2, nodesep, parens, quotes) elif isinstance(child, tuple): s += '\n'+' '*(indent+2)+ "/".join(child) elif isinstance(child, string_types) and not quotes: s += '\n'+' '*(indent+2)+ '%s' % child else: s += '\n'+' '*(indent+2)+ unicode_repr(child) return s+parens[1] def pformat_latex_qtree(self): r""" Returns a representation of the tree compatible with the LaTeX qtree package. This consists of the string ``\Tree`` followed by the tree represented in bracketed notation. For example, the following result was generated from a parse tree of the sentence ``The announcement astounded us``:: \Tree [.I'' [.N'' [.D The ] [.N' [.N announcement ] ] ] [.I' [.V'' [.V' [.V astounded ] [.N'' [.N' [.N us ] ] ] ] ] ] ] See http://www.ling.upenn.edu/advice/latex.html for the LaTeX style file for the qtree package. :return: A latex qtree representation of this tree. 
:rtype: str """ reserved_chars = re.compile('([#\$%&~_\{\}])') pformat = self.pformat(indent=6, nodesep='', parens=('[.', ' ]')) return r'\Tree ' + re.sub(reserved_chars, r'\\\1', pformat) def _pformat_flat(self, nodesep, parens, quotes): childstrs = [] for child in self: if isinstance(child, Tree): childstrs.append(child._pformat_flat(nodesep, parens, quotes)) elif isinstance(child, tuple): childstrs.append("/".join(child)) elif isinstance(child, string_types) and not quotes: childstrs.append('%s' % child) else: childstrs.append(unicode_repr(child)) if isinstance(self._label, string_types): return '%s%s%s %s%s' % (parens[0], self._label, nodesep, " ".join(childstrs), parens[1]) else: return '%s%s%s %s%s' % (parens[0], unicode_repr(self._label), nodesep, " ".join(childstrs), parens[1]) class ImmutableTree(Tree): def __init__(self, node, children=None): super(ImmutableTree, self).__init__(node, children) # Precompute our hash value. This ensures that we're really # immutable. It also means we only have to calculate it once. try: self._hash = hash((self._label, tuple(self))) except (TypeError, ValueError): raise ValueError("%s: node value and children " "must be immutable" % type(self).__name__) def __setitem__(self, index, value): raise ValueError('%s may not be modified' % type(self).__name__) def __setslice__(self, i, j, value): raise ValueError('%s may not be modified' % type(self).__name__) def __delitem__(self, index): raise ValueError('%s may not be modified' % type(self).__name__) def __delslice__(self, i, j): raise ValueError('%s may not be modified' % type(self).__name__) def __iadd__(self, other): raise ValueError('%s may not be modified' % type(self).__name__) def __imul__(self, other): raise ValueError('%s may not be modified' % type(self).__name__) def append(self, v): raise ValueError('%s may not be modified' % type(self).__name__) def extend(self, v): raise ValueError('%s may not be modified' % type(self).__name__) def pop(self, v=None): raise ValueError('%s may not be modified' % type(self).__name__) def remove(self, v): raise ValueError('%s may not be modified' % type(self).__name__) def reverse(self): raise ValueError('%s may not be modified' % type(self).__name__) def sort(self): raise ValueError('%s may not be modified' % type(self).__name__) def __hash__(self): return self._hash def set_label(self, value): """ Set the node label. This will only succeed the first time the node label is set, which should occur in ImmutableTree.__init__(). """ if hasattr(self, '_label'): raise ValueError('%s may not be modified' % type(self).__name__) self._label = value ###################################################################### ## Parented trees ###################################################################### class AbstractParentedTree(Tree): """ An abstract base class for a ``Tree`` that automatically maintains pointers to parent nodes. These parent pointers are updated whenever any change is made to a tree's structure. Two subclasses are currently defined: - ``ParentedTree`` is used for tree structures where each subtree has at most one parent. This class should be used in cases where there is no"sharing" of subtrees. - ``MultiParentedTree`` is used for tree structures where a subtree may have zero or more parents. This class should be used in cases where subtrees may be shared. 
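For example, with the ``ParentedTree`` subclass defined below, attempting to attach a subtree to a second parent is rejected:

>>> mary = ParentedTree.fromstring('(NP Mary)')
>>> tree = ParentedTree('S', [mary, ParentedTree.fromstring('(VP walks)')])
>>> ParentedTree('S', [mary])
Traceback (most recent call last):
  ...
ValueError: Can not insert a subtree that already has a parent.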
Subclassing =========== The ``AbstractParentedTree`` class redefines all operations that modify a tree's structure to call two methods, which are used by subclasses to update parent information: - ``_setparent()`` is called whenever a new child is added. - ``_delparent()`` is called whenever a child is removed. """ def __init__(self, node, children=None): super(AbstractParentedTree, self).__init__(node, children) # If children is None, the tree is read from node, and # all parents will be set during parsing. if children is not None: # Otherwise we have to set the parent of the children. # Iterate over self, and *not* children, because children # might be an iterator. for i, child in enumerate(self): if isinstance(child, Tree): self._setparent(child, i, dry_run=True) for i, child in enumerate(self): if isinstance(child, Tree): self._setparent(child, i) #//////////////////////////////////////////////////////////// # Parent management #//////////////////////////////////////////////////////////// def _setparent(self, child, index, dry_run=False): """ Update the parent pointer of ``child`` to point to ``self``. This method is only called if the type of ``child`` is ``Tree``; i.e., it is not called when adding a leaf to a tree. This method is always called before the child is actually added to the child list of ``self``. :type child: Tree :type index: int :param index: The index of ``child`` in ``self``. :raise TypeError: If ``child`` is a tree with an impropriate type. Typically, if ``child`` is a tree, then its type needs to match the type of ``self``. This prevents mixing of different tree types (single-parented, multi-parented, and non-parented). :param dry_run: If true, the don't actually set the child's parent pointer; just check for any error conditions, and raise an exception if one is found. """ raise NotImplementedError() def _delparent(self, child, index): """ Update the parent pointer of ``child`` to not point to self. This method is only called if the type of ``child`` is ``Tree``; i.e., it is not called when removing a leaf from a tree. This method is always called before the child is actually removed from the child list of ``self``. :type child: Tree :type index: int :param index: The index of ``child`` in ``self``. """ raise NotImplementedError() #//////////////////////////////////////////////////////////// # Methods that add/remove children #//////////////////////////////////////////////////////////// # Every method that adds or removes a child must make # appropriate calls to _setparent() and _delparent(). def __delitem__(self, index): # del ptree[start:stop] if isinstance(index, slice): start, stop, step = slice_bounds(self, index, allow_step=True) # Clear all the children pointers. for i in range(start, stop, step): if isinstance(self[i], Tree): self._delparent(self[i], i) # Delete the children from our child list. super(AbstractParentedTree, self).__delitem__(index) # del ptree[i] elif isinstance(index, int): if index < 0: index += len(self) if index < 0: raise IndexError('index out of range') # Clear the child's parent pointer. if isinstance(self[index], Tree): self._delparent(self[index], index) # Remove the child from our child list. 
super(AbstractParentedTree, self).__delitem__(index) elif isinstance(index, (list, tuple)): # del ptree[()] if len(index) == 0: raise IndexError('The tree position () may not be deleted.') # del ptree[(i,)] elif len(index) == 1: del self[index[0]] # del ptree[i1, i2, i3] else: del self[index[0]][index[1:]] else: raise TypeError("%s indices must be integers, not %s" % (type(self).__name__, type(index).__name__)) def __setitem__(self, index, value): # ptree[start:stop] = value if isinstance(index, slice): start, stop, step = slice_bounds(self, index, allow_step=True) # make a copy of value, in case it's an iterator if not isinstance(value, (list, tuple)): value = list(value) # Check for any error conditions, so we can avoid ending # up in an inconsistent state if an error does occur. for i, child in enumerate(value): if isinstance(child, Tree): self._setparent(child, start + i*step, dry_run=True) # clear the child pointers of all parents we're removing for i in range(start, stop, step): if isinstance(self[i], Tree): self._delparent(self[i], i) # set the child pointers of the new children. We do this # after clearing *all* child pointers, in case we're e.g. # reversing the elements in a tree. for i, child in enumerate(value): if isinstance(child, Tree): self._setparent(child, start + i*step) # finally, update the content of the child list itself. super(AbstractParentedTree, self).__setitem__(index, value) # ptree[i] = value elif isinstance(index, int): if index < 0: index += len(self) if index < 0: raise IndexError('index out of range') # if the value is not changing, do nothing. if value is self[index]: return # Set the new child's parent pointer. if isinstance(value, Tree): self._setparent(value, index) # Remove the old child's parent pointer if isinstance(self[index], Tree): self._delparent(self[index], index) # Update our child list. super(AbstractParentedTree, self).__setitem__(index, value) elif isinstance(index, (list, tuple)): # ptree[()] = value if len(index) == 0: raise IndexError('The tree position () may not be assigned to.') # ptree[(i,)] = value elif len(index) == 1: self[index[0]] = value # ptree[i1, i2, i3] = value else: self[index[0]][index[1:]] = value else: raise TypeError("%s indices must be integers, not %s" % (type(self).__name__, type(index).__name__)) def append(self, child): if isinstance(child, Tree): self._setparent(child, len(self)) super(AbstractParentedTree, self).append(child) def extend(self, children): for child in children: if isinstance(child, Tree): self._setparent(child, len(self)) super(AbstractParentedTree, self).append(child) def insert(self, index, child): # Handle negative indexes. Note that if index < -len(self), # we do *not* raise an IndexError, unlike __getitem__. This # is done for consistency with list.__getitem__ and list.index. if index < 0: index += len(self) if index < 0: index = 0 # Set the child's parent, and update our child list. if isinstance(child, Tree): self._setparent(child, index) super(AbstractParentedTree, self).insert(index, child) def pop(self, index=-1): if index < 0: index += len(self) if index < 0: raise IndexError('index out of range') if isinstance(self[index], Tree): self._delparent(self[index], index) return super(AbstractParentedTree, self).pop(index) # n.b.: like `list`, this is done by equality, not identity! # To remove a specific child, use del ptree[i]. 
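# For example, if ptree contains two distinct-but-equal (NP Mary)
# subtrees, remove() detaches whichever one compares equal first in
# left-to-right order, which need not be the object that was passed in.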
def remove(self, child): index = self.index(child) if isinstance(self[index], Tree): self._delparent(self[index], index) super(AbstractParentedTree, self).remove(child) # We need to implement __getslice__ and friends, even though # they're deprecated, because otherwise list.__getslice__ will get # called (since we're subclassing from list). Just delegate to # __getitem__ etc., but use max(0, start) and max(0, stop) because # because negative indices are already handled *before* # __getslice__ is called; and we don't want to double-count them. if hasattr(list, '__getslice__'): def __getslice__(self, start, stop): return self.__getitem__(slice(max(0, start), max(0, stop))) def __delslice__(self, start, stop): return self.__delitem__(slice(max(0, start), max(0, stop))) def __setslice__(self, start, stop, value): return self.__setitem__(slice(max(0, start), max(0, stop)), value) class ParentedTree(AbstractParentedTree): """ A ``Tree`` that automatically maintains parent pointers for single-parented trees. The following are methods for querying the structure of a parented tree: ``parent``, ``parent_index``, ``left_sibling``, ``right_sibling``, ``root``, ``treeposition``. Each ``ParentedTree`` may have at most one parent. In particular, subtrees may not be shared. Any attempt to reuse a single ``ParentedTree`` as a child of more than one parent (or as multiple children of the same parent) will cause a ``ValueError`` exception to be raised. ``ParentedTrees`` should never be used in the same tree as ``Trees`` or ``MultiParentedTrees``. Mixing tree implementations may result in incorrect parent pointers and in ``TypeError`` exceptions. """ def __init__(self, node, children=None): self._parent = None """The parent of this Tree, or None if it has no parent.""" super(ParentedTree, self).__init__(node, children) if children is None: # If children is None, the tree is read from node. # After parsing, the parent of the immediate children # will point to an intermediate tree, not self. # We fix this by brute force: for i, child in enumerate(self): if isinstance(child, Tree): child._parent = None self._setparent(child, i) def _frozen_class(self): return ImmutableParentedTree #///////////////////////////////////////////////////////////////// # Methods #///////////////////////////////////////////////////////////////// def parent(self): """The parent of this tree, or None if it has no parent.""" return self._parent def parent_index(self): """ The index of this tree in its parent. I.e., ``ptree.parent()[ptree.parent_index()] is ptree``. Note that ``ptree.parent_index()`` is not necessarily equal to ``ptree.parent.index(ptree)``, since the ``index()`` method returns the first child that is equal to its argument. """ if self._parent is None: return None for i, child in enumerate(self._parent): if child is self: return i assert False, 'expected to find self in self._parent!' def left_sibling(self): """The left sibling of this tree, or None if it has none.""" parent_index = self.parent_index() if self._parent and parent_index > 0: return self._parent[parent_index-1] return None # no left sibling def right_sibling(self): """The right sibling of this tree, or None if it has none.""" parent_index = self.parent_index() if self._parent and parent_index < (len(self._parent)-1): return self._parent[parent_index+1] return None # no right sibling def root(self): """ The root of this tree. I.e., the unique ancestor of this tree whose parent is None. If ``ptree.parent()`` is None, then ``ptree`` is its own root. 
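For example:

>>> pt = ParentedTree.fromstring('(S (NP (D the) (N dog)) (VP barks))')
>>> pt[0, 1].root() is pt
True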
""" root = self while root.parent() is not None: root = root.parent() return root def treeposition(self): """ The tree position of this tree, relative to the root of the tree. I.e., ``ptree.root[ptree.treeposition] is ptree``. """ if self.parent() is None: return () else: return self.parent().treeposition() + (self.parent_index(),) #///////////////////////////////////////////////////////////////// # Parent Management #///////////////////////////////////////////////////////////////// def _delparent(self, child, index): # Sanity checks assert isinstance(child, ParentedTree) assert self[index] is child assert child._parent is self # Delete child's parent pointer. child._parent = None def _setparent(self, child, index, dry_run=False): # If the child's type is incorrect, then complain. if not isinstance(child, ParentedTree): raise TypeError('Can not insert a non-ParentedTree '+ 'into a ParentedTree') # If child already has a parent, then complain. if child._parent is not None: raise ValueError('Can not insert a subtree that already ' 'has a parent.') # Set child's parent pointer & index. if not dry_run: child._parent = self class MultiParentedTree(AbstractParentedTree): """ A ``Tree`` that automatically maintains parent pointers for multi-parented trees. The following are methods for querying the structure of a multi-parented tree: ``parents()``, ``parent_indices()``, ``left_siblings()``, ``right_siblings()``, ``roots``, ``treepositions``. Each ``MultiParentedTree`` may have zero or more parents. In particular, subtrees may be shared. If a single ``MultiParentedTree`` is used as multiple children of the same parent, then that parent will appear multiple times in its ``parents()`` method. ``MultiParentedTrees`` should never be used in the same tree as ``Trees`` or ``ParentedTrees``. Mixing tree implementations may result in incorrect parent pointers and in ``TypeError`` exceptions. """ def __init__(self, node, children=None): self._parents = [] """A list of this tree's parents. This list should not contain duplicates, even if a parent contains this tree multiple times.""" super(MultiParentedTree, self).__init__(node, children) if children is None: # If children is None, the tree is read from node. # After parsing, the parent(s) of the immediate children # will point to an intermediate tree, not self. # We fix this by brute force: for i, child in enumerate(self): if isinstance(child, Tree): child._parents = [] self._setparent(child, i) def _frozen_class(self): return ImmutableMultiParentedTree #///////////////////////////////////////////////////////////////// # Methods #///////////////////////////////////////////////////////////////// def parents(self): """ The set of parents of this tree. If this tree has no parents, then ``parents`` is the empty set. To check if a tree is used as multiple children of the same parent, use the ``parent_indices()`` method. :type: list(MultiParentedTree) """ return list(self._parents) def left_siblings(self): """ A list of all left siblings of this tree, in any of its parent trees. A tree may be its own left sibling if it is used as multiple contiguous children of the same parent. A tree may appear multiple times in this list if it is the left sibling of this tree with respect to multiple parents. :type: list(MultiParentedTree) """ return [parent[index-1] for (parent, index) in self._get_parent_indices() if index > 0] def right_siblings(self): """ A list of all right siblings of this tree, in any of its parent trees. 
A tree may be its own right sibling if it is used as multiple contiguous children of the same parent. A tree may appear multiple times in this list if it is the right sibling of this tree with respect to multiple parents. :type: list(MultiParentedTree) """ return [parent[index+1] for (parent, index) in self._get_parent_indices() if index < (len(parent)-1)] def _get_parent_indices(self): return [(parent, index) for parent in self._parents for index, child in enumerate(parent) if child is self] def roots(self): """ The set of all roots of this tree. This set is formed by tracing all possible parent paths until trees with no parents are found. :type: list(MultiParentedTree) """ return list(self._get_roots_helper({}).values()) def _get_roots_helper(self, result): if self._parents: for parent in self._parents: parent._get_roots_helper(result) else: result[id(self)] = self return result def parent_indices(self, parent): """ Return a list of the indices where this tree occurs as a child of ``parent``. If this child does not occur as a child of ``parent``, then the empty list is returned. The following is always true:: for parent_index in ptree.parent_indices(parent): parent[parent_index] is ptree """ if parent not in self._parents: return [] else: return [index for (index, child) in enumerate(parent) if child is self] def treepositions(self, root): """ Return a list of all tree positions that can be used to reach this multi-parented tree starting from ``root``. I.e., the following is always true:: for treepos in ptree.treepositions(root): root[treepos] is ptree """ if self is root: return [()] else: return [treepos+(index,) for parent in self._parents for treepos in parent.treepositions(root) for (index, child) in enumerate(parent) if child is self] #///////////////////////////////////////////////////////////////// # Parent Management #///////////////////////////////////////////////////////////////// def _delparent(self, child, index): # Sanity checks assert isinstance(child, MultiParentedTree) assert self[index] is child assert len([p for p in child._parents if p is self]) == 1 # If the only copy of child in self is at index, then delete # self from child's parent list. for i, c in enumerate(self): if c is child and i != index: break else: child._parents.remove(self) def _setparent(self, child, index, dry_run=False): # If the child's type is incorrect, then complain. if not isinstance(child, MultiParentedTree): raise TypeError('Can not insert a non-MultiParentedTree '+ 'into a MultiParentedTree') # Add self as a parent pointer if it's not already listed. 
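# The for/else below appends self only when no existing entry in
# child._parents is this very object (an identity test, not equality),
# so the parent list never holds duplicates even if the child occurs
# more than once under the same parent.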
if not dry_run: for parent in child._parents: if parent is self: break else: child._parents.append(self) class ImmutableParentedTree(ImmutableTree, ParentedTree): pass class ImmutableMultiParentedTree(ImmutableTree, MultiParentedTree): pass ###################################################################### ## Probabilistic trees ###################################################################### @python_2_unicode_compatible class ProbabilisticTree(Tree, ProbabilisticMixIn): def __init__(self, node, children=None, **prob_kwargs): Tree.__init__(self, node, children) ProbabilisticMixIn.__init__(self, **prob_kwargs) # We have to patch up these methods to make them work right: def _frozen_class(self): return ImmutableProbabilisticTree def __repr__(self): return '%s (p=%r)' % (Tree.unicode_repr(self), self.prob()) def __str__(self): return '%s (p=%.6g)' % (self.pformat(margin=60), self.prob()) def copy(self, deep=False): if not deep: return type(self)(self._label, self, prob=self.prob()) else: return type(self).convert(self) @classmethod def convert(cls, val): if isinstance(val, Tree): children = [cls.convert(child) for child in val] if isinstance(val, ProbabilisticMixIn): return cls(val._label, children, prob=val.prob()) else: return cls(val._label, children, prob=1.0) else: return val def __eq__(self, other): return (self.__class__ is other.__class__ and (self._label, list(self), self.prob()) == (other._label, list(other), other.prob())) def __lt__(self, other): if not isinstance(other, Tree): raise_unorderable_types("<", self, other) if self.__class__ is other.__class__: return ((self._label, list(self), self.prob()) < (other._label, list(other), other.prob())) else: return self.__class__.__name__ < other.__class__.__name__ @python_2_unicode_compatible class ImmutableProbabilisticTree(ImmutableTree, ProbabilisticMixIn): def __init__(self, node, children=None, **prob_kwargs): ImmutableTree.__init__(self, node, children) ProbabilisticMixIn.__init__(self, **prob_kwargs) self._hash = hash((self._label, tuple(self), self.prob())) # We have to patch up these methods to make them work right: def _frozen_class(self): return ImmutableProbabilisticTree def __repr__(self): return '%s [%s]' % (Tree.unicode_repr(self), self.prob()) def __str__(self): return '%s [%s]' % (self.pformat(margin=60), self.prob()) def copy(self, deep=False): if not deep: return type(self)(self._label, self, prob=self.prob()) else: return type(self).convert(self) @classmethod def convert(cls, val): if isinstance(val, Tree): children = [cls.convert(child) for child in val] if isinstance(val, ProbabilisticMixIn): return cls(val._label, children, prob=val.prob()) else: return cls(val._label, children, prob=1.0) else: return val def _child_names(tree): names = [] for child in tree: if isinstance(child, Tree): names.append(Nonterminal(child._label)) else: names.append(child) return names ###################################################################### ## Parsing ###################################################################### def bracket_parse(s): """ Use Tree.read(s, remove_empty_top_bracketing=True) instead. """ raise NameError("Use Tree.read(s, remove_empty_top_bracketing=True) instead.") def sinica_parse(s): """ Parse a Sinica Treebank string and return a tree. 
Trees are represented as nested brackettings, as shown in the following example (X represents a Chinese character): S(goal:NP(Head:Nep:XX)|theme:NP(Head:Nhaa:X)|quantity:Dab:X|Head:VL2:X)#0(PERIODCATEGORY) :return: A tree corresponding to the string representation. :rtype: Tree :param s: The string to be converted :type s: str """ tokens = re.split(r'([()| ])', s) for i in range(len(tokens)): if tokens[i] == '(': tokens[i-1], tokens[i] = tokens[i], tokens[i-1] # pull nonterminal inside parens elif ':' in tokens[i]: fields = tokens[i].split(':') if len(fields) == 2: # non-terminal tokens[i] = fields[1] else: tokens[i] = "(" + fields[-2] + " " + fields[-1] + ")" elif tokens[i] == '|': tokens[i] = '' treebank_string = " ".join(tokens) return Tree.fromstring(treebank_string, remove_empty_top_bracketing=True) # s = re.sub(r'^#[^\s]*\s', '', s) # remove leading identifier # s = re.sub(r'\w+:', '', s) # remove role tags # return s ###################################################################### ## Demonstration ###################################################################### def demo(): """ A demonstration showing how Trees and Trees can be used. This demonstration creates a Tree, and loads a Tree from the Treebank corpus, and shows the results of calling several of their methods. """ from nltk import Tree, ProbabilisticTree # Demonstrate tree parsing. s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))' t = Tree.fromstring(s) print("Convert bracketed string into tree:") print(t) print(t.__repr__()) print("Display tree properties:") print(t.label()) # tree's constituent type print(t[0]) # tree's first child print(t[1]) # tree's second child print(t.height()) print(t.leaves()) print(t[1]) print(t[1,1]) print(t[1,1,0]) # Demonstrate tree modification. the_cat = t[0] the_cat.insert(1, Tree.fromstring('(JJ big)')) print("Tree modification:") print(t) t[1,1,1] = Tree.fromstring('(NN cake)') print(t) print() # Tree transforms print("Collapse unary:") t.collapse_unary() print(t) print("Chomsky normal form:") t.chomsky_normal_form() print(t) print() # Demonstrate probabilistic trees. pt = ProbabilisticTree('x', ['y', 'z'], prob=0.5) print("Probabilistic Tree:") print(pt) print() # Demonstrate parsing of treebank output format. t = Tree.fromstring(t.pformat()) print("Convert tree to bracketed string and back again:") print(t) print() # Demonstrate LaTeX output print("LaTeX output:") print(t.pformat_latex_qtree()) print() # Demonstrate Productions print("Production output:") print(t.productions()) print() # Demonstrate tree nodes containing objects other than strings t.set_label(('test', 3)) print(t) __all__ = ['ImmutableProbabilisticTree', 'ImmutableTree', 'ProbabilisticMixIn', 'ProbabilisticTree', 'Tree', 'bracket_parse', 'sinica_parse', 'ParentedTree', 'MultiParentedTree', 'ImmutableParentedTree', 'ImmutableMultiParentedTree'] nltk-3.1/nltk/treeprettyprinter.py0000644000076500000240000005745012607224144017226 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: ASCII visualization of NLTK trees # # Copyright (C) 2001-2015 NLTK Project # Author: Andreas van Cranenburgh # Peter Ljunglöf # URL: # For license information, see LICENSE.TXT """ Pretty-printing of discontinuous trees. Adapted from the disco-dop project, by Andreas van Cranenburgh. https://github.com/andreasvc/disco-dop Interesting reference (not used for this code): T. Eschbach et al., Orth. Hypergraph Drawing, Journal of Graph Algorithms and Applications, 10(2) 141--157 (2006)149. 
http://jgaa.info/accepted/2006/EschbachGuentherBecker2006.10.2.pdf """ from __future__ import division, print_function, unicode_literals from nltk.util import slice_bounds, OrderedDict from nltk.compat import string_types, python_2_unicode_compatible, unicode_repr from nltk.internals import raise_unorderable_types from nltk.tree import Tree import re import sys import codecs from cgi import escape from collections import defaultdict from operator import itemgetter from itertools import chain, islice ANSICOLOR = { 'black': 30, 'red': 31, 'green': 32, 'yellow': 33, 'blue': 34, 'magenta': 35, 'cyan': 36, 'white': 37, } @python_2_unicode_compatible class TreePrettyPrinter(object): """ Pretty-print a tree in text format, either as ASCII or Unicode. The tree can be a normal tree, or discontinuous. ``TreePrettyPrinter(tree, sentence=None, highlight=())`` creates an object from which different visualizations can be created. :param tree: a Tree object. :param sentence: a list of words (strings). If `sentence` is given, `tree` must contain integers as leaves, which are taken as indices in `sentence`. Using this you can display a discontinuous tree. :param highlight: Optionally, a sequence of Tree objects in `tree` which should be highlighted. Has the effect of only applying colors to nodes in this sequence (nodes should be given as Tree objects, terminals as indices). >>> from nltk.tree import Tree >>> tree = Tree.fromstring('(S (NP Mary) (VP walks))') >>> print(TreePrettyPrinter(tree).text()) ... # doctest: +NORMALIZE_WHITESPACE S ____|____ NP VP | | Mary walks """ def __init__(self, tree, sentence=None, highlight=()): if sentence is None: leaves = tree.leaves() if (leaves and not any(len(a) == 0 for a in tree.subtrees()) and all(isinstance(a, int) for a in leaves)): sentence = [str(a) for a in leaves] else: # this deals with empty nodes (frontier non-terminals) # and multiple/mixed terminals under non-terminals. tree = tree.copy(True) sentence = [] for a in tree.subtrees(): if len(a) == 0: a.append(len(sentence)) sentence.append(None) elif any(not isinstance(b, Tree) for b in a): for n, b in enumerate(a): if not isinstance(b, Tree): a[n] = len(sentence) sentence.append('%s' % b) self.nodes, self.coords, self.edges, self.highlight = self.nodecoords( tree, sentence, highlight) def __str__(self): return self.text() def __repr__(self): return '' % len(self.nodes) @staticmethod def nodecoords(tree, sentence, highlight): """ Produce coordinates of nodes on a grid. Objective: - Produce coordinates for a non-overlapping placement of nodes and horizontal lines. - Order edges so that crossing edges cross a minimal number of previous horizontal lines (never vertical lines). Approach: - bottom up level order traversal (start at terminals) - at each level, identify nodes which cannot be on the same row - identify nodes which cannot be in the same column - place nodes into a grid at (row, column) - order child-parent edges with crossing edges last Coordinates are (row, column); the origin (0, 0) is at the top left; the root node is on row 0. Coordinates do not consider the size of a node (which depends on font, &c), so the width of a column of the grid should be automatically determined by the element with the greatest width in that column. Alternatively, the integer coordinates could be converted to coordinates in which the distances between adjacent nodes are non-uniform. 
Produces tuple (nodes, coords, edges, highlighted) where: - nodes[id]: Tree object for the node with this integer id - coords[id]: (n, m) coordinate where to draw node with id in the grid - edges[id]: parent id of node with this id (ordered dictionary) - highlighted: set of ids that should be highlighted """ def findcell(m, matrix, startoflevel, children): """ Find vacant row, column index for node ``m``. Iterate over current rows for this level (try lowest first) and look for cell between first and last child of this node, add new row to level if no free row available. """ candidates = [a for _, a in children[m]] minidx, maxidx = min(candidates), max(candidates) leaves = tree[m].leaves() center = scale * sum(leaves) // len(leaves) # center of gravity if minidx < maxidx and not minidx < center < maxidx: center = sum(candidates) // len(candidates) if max(candidates) - min(candidates) > 2 * scale: center -= center % scale # round to unscaled coordinate if minidx < maxidx and not minidx < center < maxidx: center += scale if ids[m] == 0: startoflevel = len(matrix) for rowidx in range(startoflevel, len(matrix) + 1): if rowidx == len(matrix): # need to add a new row matrix.append([vertline if a not in (corner, None) else None for a in matrix[-1]]) row = matrix[rowidx] i = j = center if len(children[m]) == 1: # place unaries directly above child return rowidx, next(iter(children[m]))[1] elif all(a is None or a == vertline for a in row[min(candidates):max(candidates) + 1]): # find free column for n in range(scale): i = j = center + n while j > minidx or i < maxidx: if i < maxidx and (matrix[rowidx][i] is None or i in candidates): return rowidx, i elif j > minidx and (matrix[rowidx][j] is None or j in candidates): return rowidx, j i += scale j -= scale raise ValueError('could not find a free cell for:\n%s\n%s' 'min=%d; max=%d' % (tree[m], minidx, maxidx, dumpmatrix())) def dumpmatrix(): """Dump matrix contents for debugging purposes.""" return '\n'.join( '%2d: %s' % (n, ' '.join(('%2r' % i)[:2] for i in row)) for n, row in enumerate(matrix)) leaves = tree.leaves() if not all(isinstance(n, int) for n in leaves): raise ValueError('All leaves must be integer indices.') if len(leaves) != len(set(leaves)): raise ValueError('Indices must occur at most once.') if not all(0 <= n < len(sentence) for n in leaves): raise ValueError('All leaves must be in the interval 0..n ' 'with n=len(sentence)\ntokens: %d indices: ' '%r\nsentence: %s' % (len(sentence), tree.leaves(), sentence)) vertline, corner = -1, -2 # constants tree = tree.copy(True) for a in tree.subtrees(): a.sort(key=lambda n: min(n.leaves()) if isinstance(n, Tree) else n) scale = 2 crossed = set() # internal nodes and lexical nodes (no frontiers) positions = tree.treepositions() maxdepth = max(map(len, positions)) + 1 childcols = defaultdict(set) matrix = [[None] * (len(sentence) * scale)] nodes = {} ids = dict((a, n) for n, a in enumerate(positions)) highlighted_nodes = set(n for a, n in ids.items() if not highlight or tree[a] in highlight) levels = dict((n, []) for n in range(maxdepth - 1)) terminals = [] for a in positions: node = tree[a] if isinstance(node, Tree): levels[maxdepth - node.height()].append(a) else: terminals.append(a) for n in levels: levels[n].sort(key=lambda n: max(tree[n].leaves()) - min(tree[n].leaves())) terminals.sort() positions = set(positions) for m in terminals: i = int(tree[m]) * scale assert matrix[0][i] is None, (matrix[0][i], m, i) matrix[0][i] = ids[m] nodes[ids[m]] = sentence[tree[m]] if nodes[ids[m]] is None: 
nodes[ids[m]] = '...' highlighted_nodes.discard(ids[m]) positions.remove(m) childcols[m[:-1]].add((0, i)) # add other nodes centered on their children, # if the center is already taken, back off # to the left and right alternately, until an empty cell is found. for n in sorted(levels, reverse=True): nodesatdepth = levels[n] startoflevel = len(matrix) matrix.append([vertline if a not in (corner, None) else None for a in matrix[-1]]) for m in nodesatdepth: # [::-1]: if n < maxdepth - 1 and childcols[m]: _, pivot = min(childcols[m], key=itemgetter(1)) if (set(a[:-1] for row in matrix[:-1] for a in row[:pivot] if isinstance(a, tuple)) & set(a[:-1] for row in matrix[:-1] for a in row[pivot:] if isinstance(a, tuple))): crossed.add(m) rowidx, i = findcell(m, matrix, startoflevel, childcols) positions.remove(m) # block positions where children of this node branch out for _, x in childcols[m]: matrix[rowidx][x] = corner # assert m == () or matrix[rowidx][i] in (None, corner), ( # matrix[rowidx][i], m, str(tree), ' '.join(sentence)) # node itself matrix[rowidx][i] = ids[m] nodes[ids[m]] = tree[m] # add column to the set of children for its parent if m != (): childcols[m[:-1]].add((rowidx, i)) assert len(positions) == 0 # remove unused columns, right to left for m in range(scale * len(sentence) - 1, -1, -1): if not any(isinstance(row[m], (Tree, int)) for row in matrix): for row in matrix: del row[m] # remove unused rows, reverse matrix = [row for row in reversed(matrix) if not all(a is None or a == vertline for a in row)] # collect coordinates of nodes coords = {} for n, _ in enumerate(matrix): for m, i in enumerate(matrix[n]): if isinstance(i, int) and i >= 0: coords[i] = n, m # move crossed edges last positions = sorted([a for level in levels.values() for a in level], key=lambda a: a[:-1] in crossed) # collect edges from node to node edges = OrderedDict() for i in reversed(positions): for j, _ in enumerate(tree[i]): edges[ids[i + (j, )]] = ids[i] return nodes, coords, edges, highlighted_nodes def text(self, nodedist=1, unicodelines=False, html=False, ansi=False, nodecolor='blue', leafcolor='red', funccolor='green', abbreviate=None, maxwidth=16): """ :return: ASCII art for a discontinuous tree. :param unicodelines: whether to use Unicode line drawing characters instead of plain (7-bit) ASCII. :param html: whether to wrap output in html code (default plain text). :param ansi: whether to produce colors with ANSI escape sequences (only effective when html==False). :param leafcolor, nodecolor: specify colors of leaves and phrasal nodes; effective when either html or ansi is True. :param abbreviate: if True, abbreviate labels longer than 5 characters. If integer, abbreviate labels longer than `abbr` characters. :param maxwidth: maximum number of characters before a label starts to wrap; pass None to disable. """ if abbreviate == True: abbreviate = 5 if unicodelines: horzline = '\u2500' leftcorner = '\u250c' rightcorner = '\u2510' vertline = ' \u2502 ' tee = horzline + '\u252C' + horzline bottom = horzline + '\u2534' + horzline cross = horzline + '\u253c' + horzline ellipsis = '\u2026' else: horzline = '_' leftcorner = rightcorner = ' ' vertline = ' | ' tee = 3 * horzline cross = bottom = '_|_' ellipsis = '.' 
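# crosscell() below overlays a vertical branch character on the center
# of an already-rendered cell; it is used both for a parent sitting
# directly above its children in a single column and for edges that
# have to pass through rows occupied by other nodes.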
def crosscell(cur, x=vertline): """Overwrite center of this cell with a vertical branch.""" splitl = len(cur) - len(cur) // 2 - len(x) // 2 - 1 lst = list(cur) lst[splitl:splitl + len(x)] = list(x) return ''.join(lst) result = [] matrix = defaultdict(dict) maxnodewith = defaultdict(lambda: 3) maxnodeheight = defaultdict(lambda: 1) maxcol = 0 minchildcol = {} maxchildcol = {} childcols = defaultdict(set) labels = {} wrapre = re.compile('(.{%d,%d}\\b\\W*|.{%d})' % ( maxwidth - 4, maxwidth, maxwidth)) # collect labels and coordinates for a in self.nodes: row, column = self.coords[a] matrix[row][column] = a maxcol = max(maxcol, column) label = (self.nodes[a].label() if isinstance(self.nodes[a], Tree) else self.nodes[a]) if abbreviate and len(label) > abbreviate: label = label[:abbreviate] + ellipsis if maxwidth and len(label) > maxwidth: label = wrapre.sub(r'\1\n', label).strip() label = label.split('\n') maxnodeheight[row] = max(maxnodeheight[row], len(label)) maxnodewith[column] = max(maxnodewith[column], max(map(len, label))) labels[a] = label if a not in self.edges: continue # e.g., root parent = self.edges[a] childcols[parent].add((row, column)) minchildcol[parent] = min(minchildcol.get(parent, column), column) maxchildcol[parent] = max(maxchildcol.get(parent, column), column) # bottom up level order traversal for row in sorted(matrix, reverse=True): noderows = [[''.center(maxnodewith[col]) for col in range(maxcol + 1)] for _ in range(maxnodeheight[row])] branchrow = [''.center(maxnodewith[col]) for col in range(maxcol + 1)] for col in matrix[row]: n = matrix[row][col] node = self.nodes[n] text = labels[n] if isinstance(node, Tree): # draw horizontal branch towards children for this node if n in minchildcol and minchildcol[n] < maxchildcol[n]: i, j = minchildcol[n], maxchildcol[n] a, b = (maxnodewith[i] + 1) // 2 - 1, maxnodewith[j] // 2 branchrow[i] = ((' ' * a) + leftcorner).ljust( maxnodewith[i], horzline) branchrow[j] = (rightcorner + (' ' * b)).rjust( maxnodewith[j], horzline) for i in range(minchildcol[n] + 1, maxchildcol[n]): if i == col and any( a == i for _, a in childcols[n]): line = cross elif i == col: line = bottom elif any(a == i for _, a in childcols[n]): line = tee else: line = horzline branchrow[i] = line.center(maxnodewith[i], horzline) else: # if n and n in minchildcol: branchrow[col] = crosscell(branchrow[col]) text = [a.center(maxnodewith[col]) for a in text] color = nodecolor if isinstance(node, Tree) else leafcolor if isinstance(node, Tree) and node.label().startswith('-'): color = funccolor if html: text = [escape(a) for a in text] if n in self.highlight: text = ['%s' % ( color, a) for a in text] elif ansi and n in self.highlight: text = ['\x1b[%d;1m%s\x1b[0m' % ( ANSICOLOR[color], a) for a in text] for x in range(maxnodeheight[row]): # draw vertical lines in partially filled multiline node # labels, but only if it's not a frontier node. noderows[x][col] = (text[x] if x < len(text) else (vertline if childcols[n] else ' ').center( maxnodewith[col], ' ')) # for each column, if there is a node below us which has a parent # above us, draw a vertical branch in that column. 
if row != max(matrix): for n, (childrow, col) in self.coords.items(): if (n > 0 and self.coords[self.edges[n]][0] < row < childrow): branchrow[col] = crosscell(branchrow[col]) if col not in matrix[row]: for noderow in noderows: noderow[col] = crosscell(noderow[col]) branchrow = [a + ((a[-1] if a[-1] != ' ' else b[0]) * nodedist) for a, b in zip(branchrow, branchrow[1:] + [' '])] result.append(''.join(branchrow)) result.extend((' ' * nodedist).join(noderow) for noderow in reversed(noderows)) return '\n'.join(reversed(result)) + '\n' def svg(self, nodecolor='blue', leafcolor='red', funccolor='green'): """ :return: SVG representation of a tree. """ fontsize = 12 hscale = 40 vscale = 25 hstart = vstart = 20 width = max(col for _, col in self.coords.values()) height = max(row for row, _ in self.coords.values()) result = ['' % ( width * 3, height * 2.5, -hstart, -vstart, width * hscale + 3 * hstart, height * vscale + 3 * vstart) ] children = defaultdict(set) for n in self.nodes: if n: children[self.edges[n]].add(n) # horizontal branches from nodes to children for node in self.nodes: if not children[node]: continue y, x = self.coords[node] x *= hscale y *= vscale x += hstart y += vstart + fontsize // 2 childx = [self.coords[c][1] for c in children[node]] xmin = hstart + hscale * min(childx) xmax = hstart + hscale * max(childx) result.append( '\t' % (xmin, y, xmax, y)) result.append( '\t' % (x, y, x, y - fontsize // 3)) # vertical branches from children to parents for child, parent in self.edges.items(): y, _ = self.coords[parent] y *= vscale y += vstart + fontsize // 2 childy, childx = self.coords[child] childx *= hscale childy *= vscale childx += hstart childy += vstart - fontsize result += [ '\t' % (childx, childy, childx, y + 5), '\t' % (childx, childy, childx, y), ] # write nodes with coordinates for n, (row, column) in self.coords.items(): node = self.nodes[n] x = column * hscale + hstart y = row * vscale + vstart if n in self.highlight: color = nodecolor if isinstance(node, Tree) else leafcolor if isinstance(node, Tree) and node.label().startswith('-'): color = funccolor else: color = 'black' result += ['\t%s' % ( color, fontsize, x, y, escape(node.label() if isinstance(node, Tree) else node))] result += [''] return '\n'.join(result) def test(): """Do some tree drawing tests.""" def print_tree(n, tree, sentence=None, ansi=True, **xargs): print() print('{0}: "{1}"'.format(n, ' '.join(sentence or tree.leaves()))) print(tree) print() drawtree = TreePrettyPrinter(tree, sentence) try: print(drawtree.text(unicodelines=ansi, ansi=ansi, **xargs)) except (UnicodeDecodeError, UnicodeEncodeError): print(drawtree.text(unicodelines=False, ansi=False, **xargs)) from nltk.corpus import treebank for n in [0, 1440, 1591, 2771, 2170]: tree = treebank.parsed_sents()[n] print_tree(n, tree, nodedist=2, maxwidth=8) print() print('ASCII version:') print(TreePrettyPrinter(tree).text(nodedist=2)) tree = Tree.fromstring( '(top (punct 8) (smain (noun 0) (verb 1) (inf (verb 5) (inf (verb 6) ' '(conj (inf (pp (prep 2) (np (det 3) (noun 4))) (verb 7)) (inf (verb 9)) ' '(vg 10) (inf (verb 11)))))) (punct 12))', read_leaf=int) sentence = ('Ze had met haar moeder kunnen gaan winkelen ,' ' zwemmen of terrassen .'.split()) print_tree('Discontinuous tree', tree, sentence, nodedist=2) __all__ = ['TreePrettyPrinter'] if __name__ == '__main__': test() nltk-3.1/nltk/treetransforms.py0000644000076500000240000003111312574600335016460 0ustar sbstaff00000000000000# Natural Language Toolkit: Tree Transformations # # Copyright (C) 
2005-2007 Oregon Graduate Institute # Author: Nathan Bodenstab # URL: # For license information, see LICENSE.TXT """ A collection of methods for tree (grammar) transformations used in parsing natural language. Although many of these methods are technically grammar transformations (ie. Chomsky Norm Form), when working with treebanks it is much more natural to visualize these modifications in a tree structure. Hence, we will do all transformation directly to the tree itself. Transforming the tree directly also allows us to do parent annotation. A grammar can then be simply induced from the modified tree. The following is a short tutorial on the available transformations. 1. Chomsky Normal Form (binarization) It is well known that any grammar has a Chomsky Normal Form (CNF) equivalent grammar where CNF is defined by every production having either two non-terminals or one terminal on its right hand side. When we have hierarchically structured data (ie. a treebank), it is natural to view this in terms of productions where the root of every subtree is the head (left hand side) of the production and all of its children are the right hand side constituents. In order to convert a tree into CNF, we simply need to ensure that every subtree has either two subtrees as children (binarization), or one leaf node (non-terminal). In order to binarize a subtree with more than two children, we must introduce artificial nodes. There are two popular methods to convert a tree into CNF: left factoring and right factoring. The following example demonstrates the difference between them. Example:: Original Right-Factored Left-Factored A A A / | \ / \ / \ B C D ==> B A| OR A| D / \ / \ C D B C 2. Parent Annotation In addition to binarizing the tree, there are two standard modifications to node labels we can do in the same traversal: parent annotation and Markov order-N smoothing (or sibling smoothing). The purpose of parent annotation is to refine the probabilities of productions by adding a small amount of context. With this simple addition, a CYK (inside-outside, dynamic programming chart parse) can improve from 74% to 79% accuracy. A natural generalization from parent annotation is to grandparent annotation and beyond. The tradeoff becomes accuracy gain vs. computational complexity. We must also keep in mind data sparcity issues. Example:: Original Parent Annotation A A^ / | \ / \ B C D ==> B^
    A|^ where ? is the / \ parent of A C^ D^ 3. Markov order-N smoothing Markov smoothing combats data sparcity issues as well as decreasing computational requirements by limiting the number of children included in artificial nodes. In practice, most people use an order 2 grammar. Example:: Original No Smoothing Markov order 1 Markov order 2 etc. __A__ A A A / /|\ \ / \ / \ / \ B C D E F ==> B A| ==> B A| ==> B A| / \ / \ / \ C ... C ... C ... Annotation decisions can be thought about in the vertical direction (parent, grandparent, etc) and the horizontal direction (number of siblings to keep). Parameters to the following functions specify these values. For more information see: Dan Klein and Chris Manning (2003) "Accurate Unlexicalized Parsing", ACL-03. http://www.aclweb.org/anthology/P03-1054 4. Unary Collapsing Collapse unary productions (ie. subtrees with a single child) into a new non-terminal (Tree node). This is useful when working with algorithms that do not allow unary productions, yet you do not wish to lose the parent information. Example:: A | B ==> A+B / \ / \ C D C D """ from __future__ import print_function from nltk.tree import Tree def chomsky_normal_form(tree, factor="right", horzMarkov=None, vertMarkov=0, childChar="|", parentChar="^"): # assume all subtrees have homogeneous children # assume all terminals have no siblings # A semi-hack to have elegant looking code below. As a result, # any subtree with a branching factor greater than 999 will be incorrectly truncated. if horzMarkov is None: horzMarkov = 999 # Traverse the tree depth-first keeping a list of ancestor nodes to the root. # I chose not to use the tree.treepositions() method since it requires # two traversals of the tree (one to get the positions, one to iterate # over them) and node access time is proportional to the height of the node. # This method is 7x faster which helps when parsing 40,000 sentences. nodeList = [(tree, [tree.label()])] while nodeList != []: node, parent = nodeList.pop() if isinstance(node,Tree): # parent annotation parentString = "" originalNode = node.label() if vertMarkov != 0 and node != tree and isinstance(node[0],Tree): parentString = "%s<%s>" % (parentChar, "-".join(parent)) node.set_label(node.label() + parentString) parent = [originalNode] + parent[:vertMarkov - 1] # add children to the agenda before we mess with them for child in node: nodeList.append((child, parent)) # chomsky normal form factorization if len(node) > 2: childNodes = [child.label() for child in node] nodeCopy = node.copy() node[0:] = [] # delete the children curNode = node numChildren = len(nodeCopy) for i in range(1,numChildren - 1): if factor == "right": newHead = "%s%s<%s>%s" % (originalNode, childChar, "-".join(childNodes[i:min([i+horzMarkov,numChildren])]),parentString) # create new head newNode = Tree(newHead, []) curNode[0:] = [nodeCopy.pop(0), newNode] else: newHead = "%s%s<%s>%s" % (originalNode, childChar, "-".join(childNodes[max([numChildren-i-horzMarkov,0]):-i]),parentString) newNode = Tree(newHead, []) curNode[0:] = [newNode, nodeCopy.pop()] curNode = newNode curNode[0:] = [child for child in nodeCopy] def un_chomsky_normal_form(tree, expandUnary = True, childChar = "|", parentChar = "^", unaryChar = "+"): # Traverse the tree-depth first keeping a pointer to the parent for modification purposes. 
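# Round-trip sketch (illustrative; the input tree is made up): binarize a tree
# and then undo the transformation, which should restore the original structure.
#
#   from copy import deepcopy
#   from nltk.tree import Tree
#   from nltk.treetransforms import chomsky_normal_form, un_chomsky_normal_form
#
#   t = Tree.fromstring('(A (B b) (C c) (D d) (E e))')
#   cnf = deepcopy(t)
#   chomsky_normal_form(cnf, factor='right', horzMarkov=2)  # introduces artificial A|<...> nodes
#   un_chomsky_normal_form(cnf)                             # strips them again
#   assert cnf == t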
nodeList = [(tree,[])] while nodeList != []: node,parent = nodeList.pop() if isinstance(node,Tree): # if the node contains the 'childChar' character it means that # it is an artificial node and can be removed, although we still need # to move its children to its parent childIndex = node.label().find(childChar) if childIndex != -1: nodeIndex = parent.index(node) parent.remove(parent[nodeIndex]) # Generated node was on the left if the nodeIndex is 0 which # means the grammar was left factored. We must insert the children # at the beginning of the parent's children if nodeIndex == 0: parent.insert(0,node[0]) parent.insert(1,node[1]) else: parent.extend([node[0],node[1]]) # parent is now the current node so the children of parent will be added to the agenda node = parent else: parentIndex = node.label().find(parentChar) if parentIndex != -1: # strip the node name of the parent annotation node.set_label(node.label()[:parentIndex]) # expand collapsed unary productions if expandUnary == True: unaryIndex = node.label().find(unaryChar) if unaryIndex != -1: newNode = Tree(node.label()[unaryIndex + 1:], [i for i in node]) node.set_label(node.label()[:unaryIndex]) node[0:] = [newNode] for child in node: nodeList.append((child,node)) def collapse_unary(tree, collapsePOS = False, collapseRoot = False, joinChar = "+"): """ Collapse subtrees with a single child (ie. unary productions) into a new non-terminal (Tree node) joined by 'joinChar'. This is useful when working with algorithms that do not allow unary productions, and completely removing the unary productions would require loss of useful information. The Tree is modified directly (since it is passed by reference) and no value is returned. :param tree: The Tree to be collapsed :type tree: Tree :param collapsePOS: 'False' (default) will not collapse the parent of leaf nodes (ie. Part-of-Speech tags) since they are always unary productions :type collapsePOS: bool :param collapseRoot: 'False' (default) will not modify the root production if it is unary. For the Penn WSJ treebank corpus, this corresponds to the TOP -> productions. :type collapseRoot: bool :param joinChar: A string used to connect collapsed node values (default = "+") :type joinChar: str """ if collapseRoot == False and isinstance(tree, Tree) and len(tree) == 1: nodeList = [tree[0]] else: nodeList = [tree] # depth-first traversal of tree while nodeList != []: node = nodeList.pop() if isinstance(node,Tree): if len(node) == 1 and isinstance(node[0], Tree) and (collapsePOS == True or isinstance(node[0,0], Tree)): node.set_label(node.label() + joinChar + node[0].label()) node[0:] = [child for child in node[0]] # since we assigned the child's children to the current node, # evaluate the current node again nodeList.append(node) else: for child in node: nodeList.append(child) ################################################################# # Demonstration ################################################################# def demo(): """ A demonstration showing how each tree transform can be used. """ from nltk.draw.tree import draw_trees from nltk import tree, treetransforms from copy import deepcopy # original tree from WSJ bracketed text sentence = """(TOP (S (S (VP (VBN Turned) (ADVP (RB loose)) (PP (IN in) (NP (NP (NNP Shane) (NNP Longman) (POS 's)) (NN trading) (NN room))))) (, ,) (NP (DT the) (NN yuppie) (NNS dealers)) (VP (AUX do) (NP (NP (RB little)) (ADJP (RB right)))) (. 
.)))""" t = tree.Tree.fromstring(sentence, remove_empty_top_bracketing=True) # collapse subtrees with only one child collapsedTree = deepcopy(t) treetransforms.collapse_unary(collapsedTree) # convert the tree to CNF cnfTree = deepcopy(collapsedTree) treetransforms.chomsky_normal_form(cnfTree) # convert the tree to CNF with parent annotation (one level) and horizontal smoothing of order two parentTree = deepcopy(collapsedTree) treetransforms.chomsky_normal_form(parentTree, horzMarkov=2, vertMarkov=1) # convert the tree back to its original form (used to make CYK results comparable) original = deepcopy(parentTree) treetransforms.un_chomsky_normal_form(original) # convert tree back to bracketed text sentence2 = original.pprint() print(sentence) print(sentence2) print("Sentences the same? ", sentence == sentence2) draw_trees(t, collapsedTree, cnfTree, parentTree, original) if __name__ == '__main__': demo() __all__ = ["chomsky_normal_form", "un_chomsky_normal_form", "collapse_unary"] nltk-3.1/nltk/twitter/0000755000076500000240000000000012610001541014514 5ustar sbstaff00000000000000nltk-3.1/nltk/twitter/__init__.py0000644000076500000240000000136712607224144016650 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Twitter # # Copyright (C) 2001-2015 NLTK Project # Author: Ewan Klein # URL: # For license information, see LICENSE.TXT """ NLTK Twitter Package This package contains classes for retrieving Tweet documents using the Twitter API. """ try: import twython except ImportError: import warnings warnings.warn("The twython library has not been installed. " "Some functionality from the twitter package will not be available.") else: from nltk.twitter.util import Authenticate, credsfromfile from nltk.twitter.twitterclient import Streamer, Query, Twitter,\ TweetViewer, TweetWriter from nltk.twitter.common import json2csv nltk-3.1/nltk/twitter/api.py0000644000076500000240000001060212607224144015652 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Twitter API # # Copyright (C) 2001-2015 NLTK Project # Author: Ewan Klein # Lorenzo Rubio # URL: # For license information, see LICENSE.TXT """ This module provides an interface for TweetHandlers, and support for timezone handling. """ from datetime import tzinfo, timedelta, datetime from nltk.compat import UTC import time as _time class LocalTimezoneOffsetWithUTC(tzinfo): """ This is not intended to be a general purpose class for dealing with the local timezone. In particular: * it assumes that the date passed has been created using `datetime(..., tzinfo=Local)`, where `Local` is an instance of the object `LocalTimezoneOffsetWithUTC`; * for such an object, it returns the offset with UTC, used for date comparisons. Reference: https://docs.python.org/3/library/datetime.html """ STDOFFSET = timedelta(seconds=-_time.timezone) if _time.daylight: DSTOFFSET = timedelta(seconds=-_time.altzone) else: DSTOFFSET = STDOFFSET def utcoffset(self, dt): """ Access the relevant time offset. """ return self.DSTOFFSET LOCAL = LocalTimezoneOffsetWithUTC() class BasicTweetHandler(object): """ Minimal implementation of `TweetHandler`. Counts the number of Tweets and decides when the client should stop fetching them. """ def __init__(self, limit=20): self.limit = limit self.counter = 0 """ A flag to indicate to the client whether to stop fetching data given some condition (e.g., reaching a date limit). """ self.do_stop = False """ Stores the id of the last fetched Tweet to handle pagination. 
""" self.max_id = None def do_continue(self): """ Returns `False` if the client should stop fetching Tweets. """ return self.counter < self.limit and not self.do_stop class TweetHandlerI(BasicTweetHandler): """ Interface class whose subclasses should implement a handle method that Twitter clients can delegate to. """ def __init__(self, limit=20, upper_date_limit=None, lower_date_limit=None): """ :param int limit: The number of data items to process in the current\ round of processing. :param tuple upper_date_limit: The date at which to stop collecting\ new data. This should be entered as a tuple which can serve as the\ argument to `datetime.datetime`.\ E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:30 pm on April 1 2015. :param tuple lower_date_limit: The date at which to stop collecting\ new data. See `upper_data_limit` for formatting. """ BasicTweetHandler.__init__(self, limit) self.upper_date_limit = None self.lower_date_limit = None if upper_date_limit: self.upper_date_limit = datetime(*upper_date_limit, tzinfo=LOCAL) if lower_date_limit: self.lower_date_limit = datetime(*lower_date_limit, tzinfo=LOCAL) self.startingup = True def handle(self, data): """ Deal appropriately with data returned by the Twitter API """ raise NotImplementedError def on_finish(self): """ Actions when the tweet limit has been reached """ raise NotImplementedError def check_date_limit(self, data, verbose=False): """ Validate date limits. """ if self.upper_date_limit or self.lower_date_limit: date_fmt = '%a %b %d %H:%M:%S +0000 %Y' tweet_date = \ datetime.strptime(data['created_at'], date_fmt).replace(tzinfo=UTC) if (self.upper_date_limit and tweet_date > self.upper_date_limit) or \ (self.lower_date_limit and tweet_date < self.lower_date_limit): if self.upper_date_limit: message = "earlier" date_limit = self.upper_date_limit else: message = "later" date_limit = self.lower_date_limit if verbose: print("Date limit {0} is {1} than date of current tweet {2}".\ format(date_limit, message, tweet_date)) self.do_stop = True nltk-3.1/nltk/twitter/common.py0000644000076500000240000002312412607224144016374 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Twitter client # # Copyright (C) 2001-2015 NLTK Project # Author: Ewan Klein # Lorenzo Rubio # URL: # For license information, see LICENSE.TXT """ Utility functions for the :module:`twitterclient` module which do not require the `twython` library to have been installed. """ from __future__ import print_function import csv import gzip import json import nltk.compat as compat HIER_SEPARATOR = "." def extract_fields(tweet, fields): """ Extract field values from a full tweet and return them as a list :param json tweet: The tweet in JSON format :param list fields: The fields to be extracted from the tweet :rtype: list(str) """ out = [] for field in fields: try: _add_field_to_out(tweet, field, out) except TypeError: raise RuntimeError('Fatal error when extracting fields. 
Cannot find field ', field) return out def _add_field_to_out(json, field, out): if _is_composed_key(field): key, value = _get_key_value_composed(field) _add_field_to_out(json[key], value, out) else: out += [json[field]] def _is_composed_key(field): if HIER_SEPARATOR in field: return True return False def _get_key_value_composed(field): out = field.split(HIER_SEPARATOR) # there could be up to 3 levels key = out[0] value = HIER_SEPARATOR.join(out[1:]) return key, value def _get_entity_recursive(json, entity): if not json: return None elif isinstance(json, dict): for key, value in json.items(): if key == entity: return value # 'entities' and 'extended_entities' are wrappers in Twitter json # structure that contain other Twitter objects. See: # https://dev.twitter.com/overview/api/entities-in-twitter-objects if key == 'entities' or key == 'extended_entities': candidate = _get_entity_recursive(value, entity) if candidate is not None: return candidate return None elif isinstance(json, list): for item in json: candidate = _get_entity_recursive(item, entity) if candidate is not None: return candidate return None else: return None def json2csv(fp, outfile, fields, encoding='utf8', errors='replace', gzip_compress=False): """ Extract selected fields from a file of line-separated JSON tweets and write to a file in CSV format. This utility function allows a file of full tweets to be easily converted to a CSV file for easier processing. For example, just TweetIDs or just the text content of the Tweets can be extracted. Additionally, the function allows combinations of fields of other Twitter objects (mainly the users, see below). For Twitter entities (e.g. hashtags of a Tweet), and for geolocation, see `json2csv_entities` :param str infile: The name of the file containing full tweets :param str outfile: The name of the text file where results should be\ written :param list fields: The list of fields to be extracted. Useful examples\ are 'id_str' for the tweetID and 'text' for the text of the tweet. See\ for a full list of fields.\ e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count']\ Additonally, it allows IDs from other Twitter objects, e. g.,\ ['id', 'text', 'user.id', 'user.followers_count', 'user.friends_count'] :param error: Behaviour for encoding errors, see\ https://docs.python.org/3/library/codecs.html#codec-base-classes :param gzip_compress: if `True`, output files are compressed with gzip """ (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress) # write the list of fields as header writer.writerow(fields) # process the file for line in fp: tweet = json.loads(line) row = extract_fields(tweet, fields) writer.writerow(row) outf.close() def outf_writer_compat(outfile, encoding, errors, gzip_compress=False): """ Identify appropriate CSV writer given the Python version """ if compat.PY3: if gzip_compress: outf = gzip.open(outfile, 'wt', encoding=encoding, errors=errors) else: outf = open(outfile, 'w', encoding=encoding, errors=errors) writer = csv.writer(outf) else: if gzip_compress: outf = gzip.open(outfile, 'wb') else: outf = open(outfile, 'wb') writer = compat.UnicodeWriter(outf, encoding=encoding, errors=errors) return (writer, outf) def json2csv_entities(tweets_file, outfile, main_fields, entity_type, entity_fields, encoding='utf8', errors='replace', gzip_compress=False): """ Extract selected fields from a file of line-separated JSON tweets and write to a file in CSV format. 
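For example, the following sketch (file names are hypothetical) writes one CSV
row per hashtag, pairing each Tweet ID with the hashtag text::

    from nltk.twitter.common import json2csv_entities
    with open('tweets.20150430-223406.json') as fp:
        json2csv_entities(fp, 'hashtags.csv', ['id_str'], 'hashtags', ['text'])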
This utility function allows a file of full Tweets to be easily converted to a CSV file for easier processing of Twitter entities. For example, the hashtags or media elements of a tweet can be extracted. It returns one line per entity of a Tweet, e.g. if a tweet has two hashtags there will be two lines in the output file, one per hashtag :param tweets_file: the file-like object containing full Tweets :param str outfile: The path of the text file where results should be\ written :param list main_fields: The list of fields to be extracted from the main\ object, usually the tweet. Useful examples: 'id_str' for the tweetID. See\ for a full list of fields. e. g.: ['id_str'], ['id', 'text', 'favorite_count', 'retweet_count'] If `entity_type` is expressed with hierarchy, then it is the list of\ fields of the object that corresponds to the key of the entity_type,\ (e.g., for entity_type='user.urls', the fields in the main_fields list\ belong to the user object; for entity_type='place.bounding_box', the\ files in the main_field list belong to the place object of the tweet). :param list entity_type: The name of the entity: 'hashtags', 'media',\ 'urls' and 'user_mentions' for the tweet object. For a user object,\ this needs to be expressed with a hierarchy: `'user.urls'`. For the\ bounding box of the Tweet location, use `'place.bounding_box'`. :param list entity_fields: The list of fields to be extracted from the\ entity. E.g. `['text']` (of the Tweet) :param error: Behaviour for encoding errors, see\ https://docs.python.org/3/library/codecs.html#codec-base-classes :param gzip_compress: if `True`, ouput files are compressed with gzip """ (writer, outf) = outf_writer_compat(outfile, encoding, errors, gzip_compress) header = get_header_field_list(main_fields, entity_type, entity_fields) writer.writerow(header) for line in tweets_file: tweet = json.loads(line) if _is_composed_key(entity_type): key, value = _get_key_value_composed(entity_type) object_json = _get_entity_recursive(tweet, key) if not object_json: # this can happen in the case of "place" continue object_fields = extract_fields(object_json, main_fields) items = _get_entity_recursive(object_json, value) _write_to_file(object_fields, items, entity_fields, writer) else: tweet_fields = extract_fields(tweet, main_fields) items = _get_entity_recursive(tweet, entity_type) _write_to_file(tweet_fields, items, entity_fields, writer) outf.close() def get_header_field_list(main_fields, entity_type, entity_fields): if _is_composed_key(entity_type): key, value = _get_key_value_composed(entity_type) main_entity = key sub_entity = value else: main_entity = None sub_entity = entity_type if main_entity: output1 = [HIER_SEPARATOR.join([main_entity, x]) for x in main_fields] else: output1 = main_fields output2 = [HIER_SEPARATOR.join([sub_entity, x]) for x in entity_fields] return output1 + output2 def _write_to_file(object_fields, items, entity_fields, writer): if not items: # it could be that the entity is just not present for the tweet # e.g. tweet hashtag is always present, even as [], however # tweet media may not be present return if isinstance(items, dict): # this happens e.g. 
for "place" of a tweet row = object_fields # there might be composed keys in de list of required fields entity_field_values = [x for x in entity_fields if not _is_composed_key(x)] entity_field_composed = [x for x in entity_fields if _is_composed_key(x)] for field in entity_field_values: value = items[field] if isinstance(value, list): row += value else: row += [value] # now check required dictionaries for d in entity_field_composed: kd, vd = _get_key_value_composed(d) json_dict = items[kd] if not isinstance(json_dict, dict): raise RuntimeError("""Key {0} does not contain a dictionary in the json file""".format(kd)) row += [json_dict[vd]] writer.writerow(row) return # in general it is a list for item in items: row = object_fields + extract_fields(item, entity_fields) writer.writerow(row) nltk-3.1/nltk/twitter/twitter_demo.py0000644000076500000240000001756512607224144017626 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Twitter client # # Copyright (C) 2001-2015 NLTK Project # Author: Ewan Klein # Lorenzo Rubio # URL: # For license information, see LICENSE.TXT """ Examples to demo the :py:mod:`twitterclient` code. These demo functions should all run, with the following caveats: * You must have obtained API keys from Twitter, and installed them according to the instructions in the `twitter HOWTO `_. * If you are on a slow network, some of the calls to the Twitter API may timeout. * If you are being rate limited while searching, you will receive a 420 error response. * Your terminal window / console must be able to display UTF-8 encoded characters. For documentation about the Twitter APIs, see `The Streaming APIs Overview `_ and `The REST APIs Overview `_. For error codes see Twitter's `Error Codes and Responses ` """ from __future__ import print_function import datetime from functools import wraps import json from nltk.compat import StringIO from nltk.twitter import Query, Streamer, Twitter, TweetViewer, TweetWriter,\ credsfromfile SPACER = '###################################' def verbose(func): """Decorator for demo functions""" @wraps(func) def with_formatting(*args, **kwargs): print() print(SPACER) print("Using %s" % (func.__name__)) print(SPACER) return func(*args, **kwargs) return with_formatting def yesterday(): """ Get yesterday's datetime as a 5-tuple. """ date = datetime.datetime.now() date -= datetime.timedelta(days=1) date_tuple = date.timetuple()[:6] return date_tuple def setup(): """ Initialize global variables for the demos. """ global USERIDS, FIELDS USERIDS = ['759251', '612473', '15108702', '6017542', '2673523800'] # UserIDs corresponding to\ # @CNN, @BBCNews, @ReutersLive, @BreakingNews, @AJELive FIELDS = ['id_str'] @verbose def twitterclass_demo(): """ Use the simplified :class:`Twitter` class to write some tweets to a file. """ tw = Twitter() print("Track from the public stream\n") tw.tweets(keywords='love, hate', limit=10) #public stream print(SPACER) print("Search past Tweets\n") tw = Twitter() tw.tweets(keywords='love, hate', stream=False, limit=10) # search past tweets print(SPACER) print("Follow two accounts in the public stream" + " -- be prepared to wait a few minutes\n") tw = Twitter() tw.tweets(follow=['759251', '6017542'], stream=True, limit=5) #public stream @verbose def sampletoscreen_demo(limit=20): """ Sample from the Streaming API and send output to terminal. 
""" oauth = credsfromfile() client = Streamer(**oauth) client.register(TweetViewer(limit=limit)) client.sample() @verbose def tracktoscreen_demo(track="taylor swift", limit=10): """ Track keywords from the public Streaming API and send output to terminal. """ oauth = credsfromfile() client = Streamer(**oauth) client.register(TweetViewer(limit=limit)) client.filter(track=track) @verbose def search_demo(keywords='nltk'): """ Use the REST API to search for past tweets containing a given keyword. """ oauth = credsfromfile() client = Query(**oauth) for tweet in client.search_tweets(keywords=keywords, limit=10): print(tweet['text']) @verbose def tweets_by_user_demo(user='NLTK_org', count=200): """ Use the REST API to search for past tweets by a given user. """ oauth = credsfromfile() client = Query(**oauth) client.register(TweetWriter()) client.user_tweets(user, count) @verbose def lookup_by_userid_demo(): """ Use the REST API to convert a userID to a screen name. """ oauth = credsfromfile() client = Query(**oauth) user_info = client.user_info_from_id(USERIDS) for info in user_info: name = info['screen_name'] followers = info['followers_count'] following = info['friends_count'] print("{0}, followers: {1}, following: {2}".format(name, followers, following)) @verbose def followtoscreen_demo(limit=10): """ Using the Streaming API, select just the tweets from a specified list of userIDs. This is will only give results in a reasonable time if the users in question produce a high volume of tweets, and may even so show some delay. """ oauth = credsfromfile() client = Streamer(**oauth) client.register(TweetViewer(limit=limit)) client.statuses.filter(follow=USERIDS) @verbose def streamtofile_demo(limit=20): """ Write 20 tweets sampled from the public Streaming API to a file. """ oauth = credsfromfile() client = Streamer(**oauth) client.register(TweetWriter(limit=limit, repeat=False)) client.statuses.sample() @verbose def limit_by_time_demo(keywords="nltk"): """ Query the REST API for Tweets about NLTK since yesterday and send the output to terminal. This example makes the assumption that there are sufficient Tweets since yesterday for the date to be an effective cut-off. """ date = yesterday() dt_date = datetime.datetime(*date) oauth = credsfromfile() client = Query(**oauth) client.register(TweetViewer(limit=100, lower_date_limit=date)) print("Cutoff date: {}\n".format(dt_date)) for tweet in client.search_tweets(keywords=keywords): print("{} ".format(tweet['created_at']), end='') client.handler.handle(tweet) @verbose def corpusreader_demo(): """ Use :module:`TwitterCorpusReader` tp read a file of tweets, and print out * some full tweets in JSON format; * some raw strings from the tweets (i.e., the value of the `text` field); and * the result of tokenising the raw strings. """ from nltk.corpus import twitter_samples as tweets print() print("Complete tweet documents") print(SPACER) for tweet in tweets.docs("tweets.20150430-223406.json")[:1]: print(json.dumps(tweet, indent=1, sort_keys=True)) print() print("Raw tweet strings:") print(SPACER) for text in tweets.strings("tweets.20150430-223406.json")[:15]: print(text) print() print("Tokenized tweet strings:") print(SPACER) for toks in tweets.tokenized("tweets.20150430-223406.json")[:15]: print(toks) @verbose def expand_tweetids_demo(): """ Given a file object containing a list of Tweet IDs, fetch the corresponding full Tweets, if available. 
""" ids_f =\ StringIO("""\ 588665495492124672 588665495487909888 588665495508766721 588665495513006080 588665495517200384 588665495487811584 588665495525588992 588665495487844352 588665495492014081 588665495512948737""") oauth = credsfromfile() client = Query(**oauth) hydrated = client.expand_tweetids(ids_f) for tweet in hydrated: id_str = tweet['id_str'] print('id: {}'.format(id_str)) text = tweet['text'] if text.startswith('@null'): text = "[Tweet not available]" print(text + '\n') ALL = [twitterclass_demo, sampletoscreen_demo, tracktoscreen_demo, search_demo, tweets_by_user_demo, lookup_by_userid_demo, followtoscreen_demo, streamtofile_demo, limit_by_time_demo, corpusreader_demo, expand_tweetids_demo] """ Select demo functions to run. E.g. replace the following line with "DEMOS = ALL[8:]" to execute only the final three demos. """ DEMOS = ALL[:] if __name__ == "__main__": setup() for demo in DEMOS: demo() print("\n" + SPACER) print("All demos completed") print(SPACER) nltk-3.1/nltk/twitter/twitterclient.py0000644000076500000240000004467112607224144020017 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Twitter client # # Copyright (C) 2001-2015 NLTK Project # Author: Ewan Klein # Lorenzo Rubio # URL: # For license information, see LICENSE.TXT """ NLTK Twitter client This module offers methods for collecting and processing Tweets. Most of the functionality depends on access to the Twitter APIs, and this is handled via the third party Twython library. If one of the methods below returns an integer, it is probably a `Twitter error code `_. For example, the response of '420' means that you have reached the limit of the requests you can currently make to the Twitter API. Currently, `rate limits for the search API `_ are divided into 15 minute windows. """ import datetime import itertools import json import os import requests import time import gzip from twython import Twython, TwythonStreamer from twython.exceptions import TwythonRateLimitError, TwythonError from nltk.twitter.util import credsfromfile, guess_path from nltk.twitter.api import TweetHandlerI, BasicTweetHandler class Streamer(TwythonStreamer): """ Retrieve data from the Twitter Streaming API. The streaming API requires `OAuth 1.0 `_ authentication. """ def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret): self.handler = None self.do_continue = True TwythonStreamer.__init__(self, app_key, app_secret, oauth_token, oauth_token_secret) def register(self, handler): """ Register a method for handling Tweets. :param TweetHandlerI handler: method for viewing """ self.handler = handler def on_success(self, data): """ :param data: response from Twitter API """ if self.do_continue: if self.handler is not None: if 'text' in data: self.handler.counter += 1 self.handler.handle(data) self.do_continue = self.handler.do_continue() else: raise ValueError("No data handler has been registered.") else: self.disconnect() self.handler.on_finish() def on_error(self, status_code, data): """ :param status_code: The status code returned by the Twitter API :param data: The response from Twitter API """ print(status_code) def sample(self): """ Wrapper for 'statuses / sample' API call """ while self.do_continue: # Stream in an endless loop until limit is reached. 
See twython # issue 288: https://github.com/ryanmcgrath/twython/issues/288 # colditzjb commented on 9 Dec 2014 try: self.statuses.sample() except requests.exceptions.ChunkedEncodingError as e: if e is not None: print("Error (stream will continue): {0}".format(e)) continue def filter(self, track='', follow='', lang='en'): """ Wrapper for 'statuses / filter' API call """ while self.do_continue: #Stream in an endless loop until limit is reached try: if track == '' and follow == '': msg = "Please supply a value for 'track', 'follow'" raise ValueError(msg) self.statuses.filter(track=track, follow=follow, lang=lang) except requests.exceptions.ChunkedEncodingError as e: if e is not None: print("Error (stream will continue): {0}".format(e)) continue class Query(Twython): """ Retrieve data from the Twitter REST API. """ def __init__(self, app_key, app_secret, oauth_token, oauth_token_secret): self.handler = None self.do_continue = True Twython.__init__(self, app_key, app_secret, oauth_token, oauth_token_secret) def register(self, handler): """ Register a method for handling Tweets. :param TweetHandlerI handler: method for viewing or writing Tweets to a file. """ self.handler = handler def expand_tweetids(self, ids_f, verbose=True): """ Given a file object containing a list of Tweet IDs, fetch the corresponding full Tweets from the Twitter API. The API call `statuses/lookup` will fail to retrieve a Tweet if the user has deleted it. This call to the Twitter API is rate-limited. See for details. :param ids_f: input file object consisting of Tweet IDs, one to a line :return: iterable of Tweet objects in JSON format """ ids = [line.strip() for line in ids_f if line] if verbose: print("Counted {0} Tweet IDs in {1}.".format(len(ids), ids_f)) # The Twitter endpoint takes lists of up to 100 ids, so we chunk the # ids. id_chunks = [ids[i:i+100] for i in range(0, len(ids), 100)] chunked_tweets = (self.lookup_status(id=chunk) for chunk in id_chunks) return itertools.chain.from_iterable(chunked_tweets) def _search_tweets(self, keywords, limit=100, lang='en'): """ Assumes that the handler has been informed. Fetches Tweets from search_tweets generator output and passses them to handler :param str keywords: A list of query terms to search for, written as\ a comma-separated string. :param int limit: Number of Tweets to process :param str lang: language """ while True: tweets = self.search_tweets(keywords=keywords, limit=limit, lang=lang, max_id=self.handler.max_id) for tweet in tweets: self.handler.handle(tweet) if not (self.handler.do_continue() and self.handler.repeat): break self.handler.on_finish() def search_tweets(self, keywords, limit=100, lang='en', max_id=None, retries_after_twython_exception=0): """ Call the REST API ``'search/tweets'`` endpoint with some plausible defaults. See `the Twitter search documentation `_ for more information about admissable search parameters. 
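For example (a sketch, assuming valid credentials are available via
`credsfromfile`)::

    from nltk.twitter import Query, credsfromfile
    oauth = credsfromfile()
    client = Query(**oauth)
    for tweet in client.search_tweets(keywords='nltk', limit=10):
        print(tweet['text'])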
:param str keywords: A list of query terms to search for, written as\ a comma-separated string :param int limit: Number of Tweets to process :param str lang: language :param int max_id: id of the last tweet fetched :param int retries_after_twython_exception: number of retries when\ searching Tweets before raising an exception :rtype: python generator """ if not self.handler: # if no handler is provided, `BasicTweetHandler` provides minimum # functionality for limiting the number of Tweets retrieved self.handler = BasicTweetHandler(limit=limit) count_from_query = 0 if max_id: self.handler.max_id = max_id else: results = self.search(q=keywords, count=min(100, limit), lang=lang, result_type='recent') count = len(results['statuses']) if count == 0: print("No Tweets available through REST API for those keywords") return count_from_query = count self.handler.max_id = results['statuses'][count - 1]['id'] - 1 for result in results['statuses']: yield result self.handler.counter += 1 if self.handler.do_continue() == False: return # Pagination loop: keep fetching Tweets until the desired count is # reached while dealing with Twitter rate limits. retries = 0 while count_from_query < limit: try: mcount = min(100, limit-count_from_query) results = self.search(q=keywords, count=mcount, lang=lang, max_id=self.handler.max_id, result_type='recent') except TwythonRateLimitError as e: print("Waiting for 15 minutes -{0}".format(e)) time.sleep(15*60) # wait 15 minutes continue except TwythonError as e: print("Fatal error in Twython request -{0}".format(e)) if retries_after_twython_exception == retries: raise e retries += 1 count = len(results['statuses']) if count == 0: print("No more Tweets available through rest api") return count_from_query += count # the max_id is also present in the Tweet metadata # results['search_metadata']['next_results'], but as part of a # query and difficult to fetch. This is doing the equivalent # (last tweet id minus one) self.handler.max_id = results['statuses'][count - 1]['id'] - 1 for result in results['statuses']: yield result self.handler.counter += 1 if self.handler.do_continue() == False: return def user_info_from_id(self, userids): """ Convert a list of userIDs into a variety of information about the users. See . :param list userids: A list of integer strings corresponding to Twitter userIDs :rtype: list(json) """ return [self.show_user(user_id=userid) for userid in userids] def user_tweets(self, screen_name, limit, include_rts='false'): """ Return a collection of the most recent Tweets posted by the user :param str user: The user's screen name; the initial '@' symbol\ should be omitted :param int limit: The number of Tweets to recover; 200 is the maximum allowed :param str include_rts: Whether to include statuses which have been\ retweeted by the user; possible values are 'true' and 'false' """ data = self.get_user_timeline(screen_name=screen_name, count=limit, include_rts=include_rts) for item in data: self.handler.handle(item) class Twitter(object): """ Wrapper class with restricted functionality and fewer options. """ def __init__(self): self._oauth = credsfromfile() self.streamer = Streamer(**self._oauth) self.query = Query(**self._oauth) def tweets(self, keywords='', follow='', to_screen=True, stream=True, limit=100, date_limit=None, lang='en', repeat=False, gzip_compress=False): """ Process some Tweets in a simple manner. 
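For example (a sketch, assuming Twitter API credentials have been installed as
described in the NLTK Twitter HOWTO)::

    from nltk.twitter import Twitter
    tw = Twitter()
    tw.tweets(keywords='love, hate', limit=10)                  # live public stream
    tw.tweets(keywords='love, hate', stream=False, limit=10)    # search past Tweets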
:param str keywords: Keywords to use for searching or filtering :param list follow: UserIDs to use for filtering Tweets from the public stream :param bool to_screen: If `True`, display the tweet texts on the screen,\ otherwise print to a file :param bool stream: If `True`, use the live public stream,\ otherwise search past public Tweets :param int limit: The number of data items to process in the current\ round of processing. :param tuple date_limit: The date at which to stop collecting\ new data. This should be entered as a tuple which can serve as the\ argument to `datetime.datetime`.\ E.g. `date_limit=(2015, 4, 1, 12, 40)` for 12:30 pm on April 1 2015. Note that, in the case of streaming, this is the maximum date, i.e.\ a date in the future; if not, it is the minimum date, i.e. a date\ in the past :param str lang: language :param bool repeat: A flag to determine whether multiple files should\ be written. If `True`, the length of each file will be set by the\ value of `limit`. Use only if `to_screen` is `False`. See also :py:func:`handle`. :param gzip_compress: if `True`, output files are compressed with gzip. """ if stream: upper_date_limit = date_limit lower_date_limit = None else: upper_date_limit = None lower_date_limit = date_limit if to_screen: handler = TweetViewer(limit=limit, upper_date_limit=upper_date_limit, lower_date_limit=lower_date_limit) else: handler = TweetWriter(limit=limit, upper_date_limit=upper_date_limit, lower_date_limit=lower_date_limit, repeat=repeat, gzip_compress=gzip_compress) if to_screen: handler = TweetViewer(limit=limit) else: if stream: upper_date_limit = date_limit lower_date_limit = None else: upper_date_limit = None lower_date_limit = date_limit handler = TweetWriter(limit=limit, upper_date_limit=upper_date_limit, lower_date_limit=lower_date_limit, repeat=repeat, gzip_compress=gzip_compress) if stream: self.streamer.register(handler) if keywords == '' and follow == '': self.streamer.sample() else: self.streamer.filter(track=keywords, follow=follow, lang=lang) else: self.query.register(handler) if keywords == '': raise ValueError("Please supply at least one keyword to search for.") else: self.query._search_tweets(keywords, limit=limit, lang=lang) class TweetViewer(TweetHandlerI): """ Handle data by sending it to the terminal. """ def handle(self, data): """ Direct data to `sys.stdout` :return: return ``False`` if processing should cease, otherwise return ``True``. :rtype: bool :param data: Tweet object returned by Twitter API """ text = data['text'] print(text) self.check_date_limit(data) if self.do_stop: return def on_finish(self): print('Written {0} Tweets'.format(self.counter)) class TweetWriter(TweetHandlerI): """ Handle data by writing it to a file. """ def __init__(self, limit=2000, upper_date_limit=None, lower_date_limit=None, fprefix='tweets', subdir='twitter-files', repeat=False, gzip_compress=False): """ The difference between the upper and lower date limits depends on whether Tweets are coming in an ascending date order (i.e. when streaming) or descending date order (i.e. when searching past Tweets). :param int limit: number of data items to process in the current\ round of processing. :param tuple upper_date_limit: The date at which to stop collecting new\ data. This should be entered as a tuple which can serve as the\ argument to `datetime.datetime`. E.g. `upper_date_limit=(2015, 4, 1, 12,\ 40)` for 12:30 pm on April 1 2015. :param tuple lower_date_limit: The date at which to stop collecting new\ data. 
See `upper_data_limit` for formatting. :param str fprefix: The prefix to use in creating file names for Tweet\ collections. :param str subdir: The name of the directory where Tweet collection\ files should be stored. :param bool repeat: flag to determine whether multiple files should be\ written. If `True`, the length of each file will be set by the value\ of `limit`. See also :py:func:`handle`. :param gzip_compress: if `True`, ouput files are compressed with gzip. """ self.fprefix = fprefix self.subdir = guess_path(subdir) self.gzip_compress = gzip_compress self.fname = self.timestamped_file() self.repeat = repeat self.output = None TweetHandlerI.__init__(self, limit, upper_date_limit, lower_date_limit) def timestamped_file(self): """ :return: timestamped file name :rtype: str """ subdir = self.subdir fprefix = self.fprefix if subdir: if not os.path.exists(subdir): os.mkdir(subdir) fname = os.path.join(subdir, fprefix) fmt = '%Y%m%d-%H%M%S' timestamp = datetime.datetime.now().strftime(fmt) if self.gzip_compress: suffix = '.gz' else: suffix = '' outfile = '{0}.{1}.json{2}'.format(fname, timestamp, suffix) return outfile def handle(self, data): """ Write Twitter data as line-delimited JSON into one or more files. :return: return `False` if processing should cease, otherwise return `True`. :param data: tweet object returned by Twitter API """ if self.startingup: if self.gzip_compress: self.output = gzip.open(self.fname, 'w') else: self.output = open(self.fname, 'w') print('Writing to {0}'.format(self.fname)) json_data = json.dumps(data) if self.gzip_compress: self.output.write((json_data + "\n").encode('utf-8')) else: self.output.write(json_data + "\n") self.check_date_limit(data) if self.do_stop: return self.startingup = False def on_finish(self): print('Written {0} Tweets'.format(self.counter)) if self.output: self.output.close() def do_continue(self): if self.repeat == False: return TweetHandlerI.do_continue(self) if self.do_stop: # stop for a functional cause (e.g. date limit) return False if self.counter == self.limit: # repeat is True, thus close output file and # create a new one self._restart_file() return True def _restart_file(self): self.on_finish() self.fname = self.timestamped_file() self.startingup = True self.counter = 0 nltk-3.1/nltk/twitter/util.py0000644000076500000240000001067512607224144016070 0ustar sbstaff00000000000000# -*- coding: utf-8 -*- # Natural Language Toolkit: Twitter client # # Copyright (C) 2001-2015 NLTK Project # Author: Ewan Klein # Lorenzo Rubio # URL: # For license information, see LICENSE.TXT """ Authentication utilities to accompany :module:`twitterclient`. """ from __future__ import print_function import os import pprint from twython import Twython def credsfromfile(creds_file=None, subdir=None, verbose=False): """ Convenience function for authentication """ return Authenticate().load_creds(creds_file=creds_file, subdir=subdir, verbose=verbose) class Authenticate(object): """ Methods for authenticating with Twitter. """ def __init__(self): self.creds_file = 'credentials.txt' self.creds_fullpath = None self.oauth = {} try: self.twitter_dir = os.environ['TWITTER'] self.creds_subdir = self.twitter_dir except KeyError: self.twitter_dir = None self.creds_subdir = None def load_creds(self, creds_file=None, subdir=None, verbose=False): """ Read OAuth credentials from a text file. 
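For example (a sketch; the directory path is hypothetical)::

    import os
    from nltk.twitter import credsfromfile
    os.environ['TWITTER'] = '/path/to/twitter-files'
    oauth = credsfromfile()   # reads credentials.txt from the TWITTER directory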
:: File format for OAuth 1 ======================= app_key=YOUR_APP_KEY app_secret=YOUR_APP_SECRET oauth_token=OAUTH_TOKEN oauth_token_secret=OAUTH_TOKEN_SECRET :: File format for OAuth 2 ======================= app_key=YOUR_APP_KEY app_secret=YOUR_APP_SECRET access_token=ACCESS_TOKEN :param str file_name: File containing credentials. ``None`` (default) reads\ data from `TWITTER/'credentials.txt'` """ if creds_file is not None: self.creds_file = creds_file if subdir is None: if self.creds_subdir is None: msg = "Supply a value to the 'subdir' parameter or" +\ " set the TWITTER environment variable." raise ValueError(msg) else: self.creds_subdir = subdir self.creds_fullpath =\ os.path.normpath(os.path.join(self.creds_subdir, self.creds_file)) if not os.path.isfile(self.creds_fullpath): raise OSError('Cannot find file {}'.format(self.creds_fullpath)) with open(self.creds_fullpath) as infile: if verbose: print('Reading credentials file {}'.format(self.creds_fullpath)) for line in infile: if '=' in line: name, value = line.split('=', 1) self.oauth[name.strip()] = value.strip() self._validate_creds_file(verbose=verbose) return self.oauth def _validate_creds_file(self, verbose=False): """Check validity of a credentials file.""" oauth1 = False oauth1_keys = ['app_key', 'app_secret', 'oauth_token', 'oauth_token_secret'] oauth2 = False oauth2_keys = ['app_key', 'app_secret', 'access_token'] if all(k in self.oauth for k in oauth1_keys): oauth1 = True elif all(k in self.oauth for k in oauth2_keys): oauth2 = True if not (oauth1 or oauth2): msg = 'Missing or incorrect entries in {}\n'.format(self.creds_file) msg += pprint.pformat(self.oauth) raise ValueError(msg) elif verbose: print('Credentials file "{}" looks good'.format(self.creds_file)) def add_access_token(creds_file=None): """ For OAuth 2, retrieve an access token for an app and append it to a credentials file. """ if creds_file is None: path = os.path.dirname(__file__) creds_file = os.path.join(path, 'credentials2.txt') oauth2 = credsfromfile(creds_file=creds_file) app_key = oauth2['app_key'] app_secret = oauth2['app_secret'] twitter = Twython(app_key, app_secret, oauth_version=2) access_token = twitter.obtain_access_token() tok = 'access_token={}\n'.format(access_token) with open(creds_file, 'a') as infile: print(tok, file=infile) def guess_path(pth): """ If the path is not absolute, guess that it is a subdirectory of the user's home directory. 
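For example (a sketch; the expansion depends on the current user's home
directory)::

    guess_path('twitter-files')   # e.g. '/home/me/twitter-files'
    guess_path('/tmp/tweets')     # absolute paths are returned unchanged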
:param str pth: The pathname of the directory where files of tweets should be written """ if os.path.isabs(pth): return pth else: return os.path.expanduser(os.path.join("~", pth)) nltk-3.1/nltk/util.py0000644000076500000240000012102012607224144014351 0ustar sbstaff00000000000000# Natural Language Toolkit: Utility functions # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # URL: # For license information, see LICENSE.TXT from __future__ import print_function import locale import re import types import textwrap import pydoc import bisect import os from itertools import islice, chain, combinations from pprint import pprint from collections import defaultdict, deque from sys import version_info from nltk.internals import slice_bounds, raise_unorderable_types from nltk.compat import (class_types, text_type, string_types, total_ordering, python_2_unicode_compatible, getproxies, ProxyHandler, build_opener, install_opener, HTTPPasswordMgrWithDefaultRealm, ProxyBasicAuthHandler, ProxyDigestAuthHandler) ###################################################################### # Short usage message ###################################################################### def usage(obj, selfname='self'): import inspect str(obj) # In case it's lazy, this will load it. if not isinstance(obj, class_types): obj = obj.__class__ print('%s supports the following operations:' % obj.__name__) for (name, method) in sorted(pydoc.allmethods(obj).items()): if name.startswith('_'): continue if getattr(method, '__deprecated__', False): continue args, varargs, varkw, defaults = inspect.getargspec(method) if (args and args[0]=='self' and (defaults is None or len(args)>len(defaults))): args = args[1:] name = '%s.%s' % (selfname, name) argspec = inspect.formatargspec( args, varargs, varkw, defaults) print(textwrap.fill('%s%s' % (name, argspec), initial_indent=' - ', subsequent_indent=' '*(len(name)+5))) ########################################################################## # IDLE ########################################################################## def in_idle(): """ Return True if this function is run within idle. Tkinter programs that are run in idle should never call ``Tk.mainloop``; so this function should be used to gate all calls to ``Tk.mainloop``. :warning: This function works by checking ``sys.stdin``. If the user has modified ``sys.stdin``, then it may return incorrect results. 
:rtype: bool """ import sys return sys.stdin.__class__.__name__ in ('PyShell', 'RPCProxy') ########################################################################## # PRETTY PRINTING ########################################################################## def pr(data, start=0, end=None): """ Pretty print a sequence of data items :param data: the data stream to print :type data: sequence or iter :param start: the start position :type start: int :param end: the end position :type end: int """ pprint(list(islice(data, start, end))) def print_string(s, width=70): """ Pretty print a string, breaking lines on whitespace :param s: the string to print, consisting of words and spaces :type s: str :param width: the display width :type width: int """ print('\n'.join(textwrap.wrap(s, width=width))) def tokenwrap(tokens, separator=" ", width=70): """ Pretty print a list of text tokens, breaking lines on whitespace :param tokens: the tokens to print :type tokens: list :param separator: the string to use to separate tokens :type separator: str :param width: the display width (default=70) :type width: int """ return '\n'.join(textwrap.wrap(separator.join(tokens), width=width)) ########################################################################## # Python version ########################################################################## def py25(): return version_info[0] == 2 and version_info[1] == 5 def py26(): return version_info[0] == 2 and version_info[1] == 6 def py27(): return version_info[0] == 2 and version_info[1] == 7 ########################################################################## # Indexing ########################################################################## class Index(defaultdict): def __init__(self, pairs): defaultdict.__init__(self, list) for key, value in pairs: self[key].append(value) ###################################################################### ## Regexp display (thanks to David Mertz) ###################################################################### def re_show(regexp, string, left="{", right="}"): """ Return a string with markers surrounding the matched substrings. Search str for substrings matching ``regexp`` and wrap the matches with braces. This is convenient for learning about regular expressions. :param regexp: The regular expression. :type regexp: str :param string: The string being matched. :type string: str :param left: The left delimiter (printed before the matched substring) :type left: str :param right: The right delimiter (printed after the matched substring) :type right: str :rtype: str """ print(re.compile(regexp, re.M).sub(left + r"\g<0>" + right, string.rstrip())) ########################################################################## # READ FROM FILE OR STRING ########################################################################## # recipe from David Mertz def filestring(f): if hasattr(f, 'read'): return f.read() elif isinstance(f, string_types): with open(f, 'r') as infile: return infile.read() else: raise ValueError("Must be called with a filename or file-like object") ########################################################################## # Breadth-First Search ########################################################################## def breadth_first(tree, children=iter, maxdepth=-1): """Traverse the nodes of a tree in breadth-first order. (No need to check for cycles.) 
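For example, using a small dict of child lists to encode the tree (a minimal sketch):

    >>> from nltk.util import breadth_first
    >>> tree = {'a': ['b', 'c'], 'b': ['d'], 'c': [], 'd': []}
    >>> list(breadth_first('a', children=lambda node: tree[node]))
    ['a', 'b', 'c', 'd']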
The first argument should be the tree root; children should be a function taking as argument a tree node and returning an iterator of the node's children. """ queue = deque([(tree, 0)]) while queue: node, depth = queue.popleft() yield node if depth != maxdepth: try: queue.extend((c, depth + 1) for c in children(node)) except TypeError: pass ########################################################################## # Guess Character Encoding ########################################################################## # adapted from io.py in the docutils extension module (http://docutils.sourceforge.net) # http://www.pyzine.com/Issue008/Section_Articles/article_Encodings.html def guess_encoding(data): """ Given a byte string, attempt to decode it. Tries the standard 'UTF8' and 'latin-1' encodings, Plus several gathered from locale information. The calling program *must* first call:: locale.setlocale(locale.LC_ALL, '') If successful it returns ``(decoded_unicode, successful_encoding)``. If unsuccessful it raises a ``UnicodeError``. """ successful_encoding = None # we make 'utf-8' the first encoding encodings = ['utf-8'] # # next we add anything we can learn from the locale try: encodings.append(locale.nl_langinfo(locale.CODESET)) except AttributeError: pass try: encodings.append(locale.getlocale()[1]) except (AttributeError, IndexError): pass try: encodings.append(locale.getdefaultlocale()[1]) except (AttributeError, IndexError): pass # # we try 'latin-1' last encodings.append('latin-1') for enc in encodings: # some of the locale calls # may have returned None if not enc: continue try: decoded = text_type(data, enc) successful_encoding = enc except (UnicodeError, LookupError): pass else: break if not successful_encoding: raise UnicodeError( 'Unable to decode input data. Tried the following encodings: %s.' % ', '.join([repr(enc) for enc in encodings if enc])) else: return (decoded, successful_encoding) ########################################################################## # Remove repeated elements from a list deterministcally ########################################################################## def unique_list(xs): seen = set() # not seen.add(x) here acts to make the code shorter without using if statements, seen.add(x) always returns None. return [x for x in xs if x not in seen and not seen.add(x)] ########################################################################## # Invert a dictionary ########################################################################## def invert_dict(d): inverted_dict = defaultdict(list) for key in d: if hasattr(d[key], '__iter__'): for term in d[key]: inverted_dict[term].append(key) else: inverted_dict[d[key]] = key return inverted_dict ########################################################################## # Utilities for directed graphs: transitive closure, and inversion # The graph is represented as a dictionary of sets ########################################################################## def transitive_closure(graph, reflexive=False): """ Calculate the transitive closure of a directed graph, optionally the reflexive transitive closure. The algorithm is a slight modification of the "Marking Algorithm" of Ioannidis & Ramakrishnan (1998) "Efficient Transitive Closure Algorithms". 
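For example (a minimal sketch):

    >>> from nltk.util import transitive_closure
    >>> graph = {1: {2}, 2: {3}, 3: set()}
    >>> sorted(transitive_closure(graph)[1])
    [2, 3]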
:param graph: the initial graph, represented as a dictionary of sets :type graph: dict(set) :param reflexive: if set, also make the closure reflexive :type reflexive: bool :rtype: dict(set) """ if reflexive: base_set = lambda k: set([k]) else: base_set = lambda k: set() # The graph U_i in the article: agenda_graph = dict((k, graph[k].copy()) for k in graph) # The graph M_i in the article: closure_graph = dict((k, base_set(k)) for k in graph) for i in graph: agenda = agenda_graph[i] closure = closure_graph[i] while agenda: j = agenda.pop() closure.add(j) closure |= closure_graph.setdefault(j, base_set(j)) agenda |= agenda_graph.get(j, base_set(j)) agenda -= closure return closure_graph def invert_graph(graph): """ Inverts a directed graph. :param graph: the graph, represented as a dictionary of sets :type graph: dict(set) :return: the inverted graph :rtype: dict(set) """ inverted = {} for key in graph: for value in graph[key]: inverted.setdefault(value, set()).add(key) return inverted ########################################################################## # HTML Cleaning ########################################################################## def clean_html(html): raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function") def clean_url(url): raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function") ########################################################################## # FLATTEN LISTS ########################################################################## def flatten(*args): """ Flatten a list. >>> from nltk.util import flatten >>> flatten(1, 2, ['b', 'a' , ['c', 'd']], 3) [1, 2, 'b', 'a', 'c', 'd', 3] :param args: items and lists to be combined into a single list :rtype: list """ x = [] for l in args: if not isinstance(l, (list, tuple)): l = [l] for item in l: if isinstance(item, (list, tuple)): x.extend(flatten(item)) else: x.append(item) return x ########################################################################## # Ngram iteration ########################################################################## # add a flag to pad the sequence so we get peripheral ngrams? def ngrams(sequence, n, pad_left=False, pad_right=False, pad_symbol=None): """ Return the ngrams generated from a sequence of items, as an iterator. For example: >>> from nltk.util import ngrams >>> list(ngrams([1,2,3,4,5], 3)) [(1, 2, 3), (2, 3, 4), (3, 4, 5)] Use ngrams for a list version of this function. Set pad_left or pad_right to true in order to get additional ngrams: >>> list(ngrams([1,2,3,4,5], 2, pad_right=True)) [(1, 2), (2, 3), (3, 4), (4, 5), (5, None)] :param sequence: the source data to be converted into ngrams :type sequence: sequence or iter :param n: the degree of the ngrams :type n: int :param pad_left: whether the ngrams should be left-padded :type pad_left: bool :param pad_right: whether the ngrams should be right-padded :type pad_right: bool :param pad_symbol: the symbol to use for padding (default is None) :type pad_symbol: any :rtype: iter(tuple) """ sequence = iter(sequence) if pad_left: sequence = chain((pad_symbol,) * (n-1), sequence) if pad_right: sequence = chain(sequence, (pad_symbol,) * (n-1)) history = [] while n > 1: history.append(next(sequence)) n -= 1 for item in sequence: history.append(item) yield tuple(history) del history[0] def bigrams(sequence, **kwargs): """ Return the bigrams generated from a sequence of items, as an iterator. 
For example: >>> from nltk.util import bigrams >>> list(bigrams([1,2,3,4,5])) [(1, 2), (2, 3), (3, 4), (4, 5)] Use bigrams for a list version of this function. :param sequence: the source data to be converted into bigrams :type sequence: sequence or iter :rtype: iter(tuple) """ for item in ngrams(sequence, 2, **kwargs): yield item def trigrams(sequence, **kwargs): """ Return the trigrams generated from a sequence of items, as an iterator. For example: >>> from nltk.util import trigrams >>> list(trigrams([1,2,3,4,5])) [(1, 2, 3), (2, 3, 4), (3, 4, 5)] Use trigrams for a list version of this function. :param sequence: the source data to be converted into trigrams :type sequence: sequence or iter :rtype: iter(tuple) """ for item in ngrams(sequence, 3, **kwargs): yield item def everygrams(sequence, min_len=1, max_len=-1): """ Returns all possible ngrams generated from a sequence of items, as an iterator. >>> sent = 'a b c'.split() >>> list(everygrams(sent)) [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c'), ('a', 'b', 'c')] >>> list(everygrams(sent, max_len=2)) [('a',), ('b',), ('c',), ('a', 'b'), ('b', 'c')] :param sequence: the source data to be converted into trigrams :type sequence: sequence or iter :param min_len: minimum length of the ngrams, aka. n-gram order/degree of ngram :type min_len: int :param max_len: maximum length of the ngrams (set to length of sequence by default) :type max_len: int :rtype: iter(tuple) """ if max_len == -1: max_len = len(sequence) for n in range(min_len, max_len+1): for ng in ngrams(sequence, n): yield ng def skipgrams(sequence, n, k): """ Returns all possible skipgrams generated from a sequence of items, as an iterator. Skipgrams are ngrams that allows tokens to be skipped. Refer to http://homepages.inf.ed.ac.uk/ballison/pdf/lrec_skipgrams.pdf >>> sent = "Insurgents killed in ongoing fighting".split() >>> list(skipgrams(sent, 2, 2)) [('Insurgents', 'killed'), ('Insurgents', 'in'), ('Insurgents', 'ongoing'), ('killed', 'in'), ('killed', 'ongoing'), ('killed', 'fighting'), ('in', 'ongoing'), ('in', 'fighting'), ('ongoing', 'fighting')] >>> list(skipgrams(sent, 3, 2)) [('Insurgents', 'killed', 'in'), ('Insurgents', 'killed', 'ongoing'), ('Insurgents', 'killed', 'fighting'), ('Insurgents', 'in', 'ongoing'), ('Insurgents', 'in', 'fighting'), ('Insurgents', 'ongoing', 'fighting'), ('killed', 'in', 'ongoing'), ('killed', 'in', 'fighting'), ('killed', 'ongoing', 'fighting'), ('in', 'ongoing', 'fighting')] :param sequence: the source data to be converted into trigrams :type sequence: sequence or iter :param n: the degree of the ngrams :type n: int :param k: the skip distance :type k: int :rtype: iter(tuple) """ for ngram in ngrams(sequence, n + k, pad_right=True): head = ngram[:1] tail = ngram[1:] for skip_tail in combinations(tail, n - 1): if skip_tail[-1] is None: continue yield head + skip_tail ########################################################################## # Ordered Dictionary ########################################################################## class OrderedDict(dict): def __init__(self, data=None, **kwargs): self._keys = self.keys(data, kwargs.get('keys')) self._default_factory = kwargs.get('default_factory') if data is None: dict.__init__(self) else: dict.__init__(self, data) def __delitem__(self, key): dict.__delitem__(self, key) self._keys.remove(key) def __getitem__(self, key): try: return dict.__getitem__(self, key) except KeyError: return self.__missing__(key) def __iter__(self): return (key for key in self.keys()) def __missing__(self, 
key): if not self._default_factory and key not in self._keys: raise KeyError() return self._default_factory() def __setitem__(self, key, item): dict.__setitem__(self, key, item) if key not in self._keys: self._keys.append(key) def clear(self): dict.clear(self) self._keys.clear() def copy(self): d = dict.copy(self) d._keys = self._keys return d def items(self): # returns iterator under python 3 and list under python 2 return zip(self.keys(), self.values()) def keys(self, data=None, keys=None): if data: if keys: assert isinstance(keys, list) assert len(data) == len(keys) return keys else: assert isinstance(data, dict) or \ isinstance(data, OrderedDict) or \ isinstance(data, list) if isinstance(data, dict) or isinstance(data, OrderedDict): return data.keys() elif isinstance(data, list): return [key for (key, value) in data] elif '_keys' in self.__dict__: return self._keys else: return [] def popitem(self): if not self._keys: raise KeyError() key = self._keys.pop() value = self[key] del self[key] return (key, value) def setdefault(self, key, failobj=None): dict.setdefault(self, key, failobj) if key not in self._keys: self._keys.append(key) def update(self, data): dict.update(self, data) for key in self.keys(data): if key not in self._keys: self._keys.append(key) def values(self): # returns iterator under python 3 return map(self.get, self._keys) ###################################################################### # Lazy Sequences ###################################################################### @total_ordering @python_2_unicode_compatible class AbstractLazySequence(object): """ An abstract base class for read-only sequences whose values are computed as needed. Lazy sequences act like tuples -- they can be indexed, sliced, and iterated over; but they may not be modified. The most common application of lazy sequences in NLTK is for corpus view objects, which provide access to the contents of a corpus without loading the entire corpus into memory, by loading pieces of the corpus from disk as needed. The result of modifying a mutable element of a lazy sequence is undefined. In particular, the modifications made to the element may or may not persist, depending on whether and when the lazy sequence caches that element's value or reconstructs it from scratch. Subclasses are required to define two methods: ``__len__()`` and ``iterate_from()``. """ def __len__(self): """ Return the number of tokens in the corpus file underlying this corpus view. """ raise NotImplementedError('should be implemented by subclass') def iterate_from(self, start): """ Return an iterator that generates the tokens in the corpus file underlying this corpus view, starting at the token number ``start``. If ``start>=len(self)``, then this iterator will generate no tokens. """ raise NotImplementedError('should be implemented by subclass') def __getitem__(self, i): """ Return the *i* th token in the corpus file underlying this corpus view. Negative indices and spans are both supported. """ if isinstance(i, slice): start, stop = slice_bounds(self, i) return LazySubsequence(self, start, stop) else: # Handle negative indices if i < 0: i += len(self) if i < 0: raise IndexError('index out of range') # Use iterate_from to extract it. 
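# (next() on iterate_from(i) raises StopIteration when i is past the end of the sequence; the except clause below converts that into the IndexError a plain list would raise.)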
try: return next(self.iterate_from(i)) except StopIteration: raise IndexError('index out of range') def __iter__(self): """Return an iterator that generates the tokens in the corpus file underlying this corpus view.""" return self.iterate_from(0) def count(self, value): """Return the number of times this list contains ``value``.""" return sum(1 for elt in self if elt==value) def index(self, value, start=None, stop=None): """Return the index of the first occurrence of ``value`` in this list that is greater than or equal to ``start`` and less than ``stop``. Negative start and stop values are treated like negative slice bounds -- i.e., they count from the end of the list.""" start, stop = slice_bounds(self, slice(start, stop)) for i, elt in enumerate(islice(self, start, stop)): if elt == value: return i+start raise ValueError('index(x): x not in list') def __contains__(self, value): """Return true if this list contains ``value``.""" return bool(self.count(value)) def __add__(self, other): """Return a list concatenating self with other.""" return LazyConcatenation([self, other]) def __radd__(self, other): """Return a list concatenating other with self.""" return LazyConcatenation([other, self]) def __mul__(self, count): """Return a list concatenating self with itself ``count`` times.""" return LazyConcatenation([self] * count) def __rmul__(self, count): """Return a list concatenating self with itself ``count`` times.""" return LazyConcatenation([self] * count) _MAX_REPR_SIZE = 60 def __repr__(self): """ Return a string representation for this corpus view that is similar to a list's representation; but if it would be more than 60 characters long, it is truncated. """ pieces = [] length = 5 for elt in self: pieces.append(repr(elt)) length += len(pieces[-1]) + 2 if length > self._MAX_REPR_SIZE and len(pieces) > 2: return '[%s, ...]' % text_type(', ').join(pieces[:-1]) else: return '[%s]' % text_type(', ').join(pieces) def __eq__(self, other): return (type(self) == type(other) and list(self) == list(other)) def __ne__(self, other): return not self == other def __lt__(self, other): if type(other) != type(self): raise_unorderable_types("<", self, other) return list(self) < list(other) def __hash__(self): """ :raise ValueError: Corpus view objects are unhashable. """ raise ValueError('%s objects are unhashable' % self.__class__.__name__) class LazySubsequence(AbstractLazySequence): """ A subsequence produced by slicing a lazy sequence. This slice keeps a reference to its source sequence, and generates its values by looking them up in the source sequence. """ MIN_SIZE = 100 """ The minimum size for which lazy slices should be created. If ``LazySubsequence()`` is called with a subsequence that is shorter than ``MIN_SIZE``, then a tuple will be returned instead. """ def __new__(cls, source, start, stop): """ Construct a new slice from a given underlying sequence. The ``start`` and ``stop`` indices should be absolute indices -- i.e., they should not be negative (for indexing from the back of a list) or greater than the length of ``source``. """ # If the slice is small enough, just use a tuple. 
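# (Note: although the MIN_SIZE docstring above speaks of returning a tuple, the code below actually materializes the short slice as a plain list; because the returned object is not a LazySubsequence instance, __init__ is not invoked for it.)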
if stop-start < cls.MIN_SIZE: return list(islice(source.iterate_from(start), stop-start)) else: return object.__new__(cls) def __init__(self, source, start, stop): self._source = source self._start = start self._stop = stop def __len__(self): return self._stop - self._start def iterate_from(self, start): return islice(self._source.iterate_from(start+self._start), max(0, len(self)-start)) class LazyConcatenation(AbstractLazySequence): """ A lazy sequence formed by concatenating a list of lists. This underlying list of lists may itself be lazy. ``LazyConcatenation`` maintains an index that it uses to keep track of the relationship between offsets in the concatenated lists and offsets in the sublists. """ def __init__(self, list_of_lists): self._list = list_of_lists self._offsets = [0] def __len__(self): if len(self._offsets) <= len(self._list): for tok in self.iterate_from(self._offsets[-1]): pass return self._offsets[-1] def iterate_from(self, start_index): if start_index < self._offsets[-1]: sublist_index = bisect.bisect_right(self._offsets, start_index)-1 else: sublist_index = len(self._offsets)-1 index = self._offsets[sublist_index] # Construct an iterator over the sublists. if isinstance(self._list, AbstractLazySequence): sublist_iter = self._list.iterate_from(sublist_index) else: sublist_iter = islice(self._list, sublist_index, None) for sublist in sublist_iter: if sublist_index == (len(self._offsets)-1): assert index+len(sublist) >= self._offsets[-1], ( 'offests not monotonic increasing!') self._offsets.append(index+len(sublist)) else: assert self._offsets[sublist_index+1] == index+len(sublist), ( 'inconsistent list value (num elts)') for value in sublist[max(0, start_index-index):]: yield value index += len(sublist) sublist_index += 1 class LazyMap(AbstractLazySequence): """ A lazy sequence whose elements are formed by applying a given function to each element in one or more underlying lists. The function is applied lazily -- i.e., when you read a value from the list, ``LazyMap`` will calculate that value by applying its function to the underlying lists' value(s). ``LazyMap`` is essentially a lazy version of the Python primitive function ``map``. In particular, the following two expressions are equivalent: >>> from nltk.util import LazyMap >>> function = str >>> sequence = [1,2,3] >>> map(function, sequence) # doctest: +SKIP ['1', '2', '3'] >>> list(LazyMap(function, sequence)) ['1', '2', '3'] Like the Python ``map`` primitive, if the source lists do not have equal size, then the value None will be supplied for the 'missing' elements. Lazy maps can be useful for conserving memory, in cases where individual values take up a lot of space. This is especially true if the underlying list's values are constructed lazily, as is the case with many corpus readers. A typical example of a use case for this class is performing feature detection on the tokens in a corpus. Since featuresets are encoded as dictionaries, which can take up a lot of memory, using a ``LazyMap`` can significantly reduce memory usage when training and running classifiers. """ def __init__(self, function, *lists, **config): """ :param function: The function that should be applied to elements of ``lists``. It should take as many arguments as there are ``lists``. :param lists: The underlying lists. :param cache_size: Determines the size of the cache used by this lazy map. 
(default=5) """ if not lists: raise TypeError('LazyMap requires at least two args') self._lists = lists self._func = function self._cache_size = config.get('cache_size', 5) self._cache = ({} if self._cache_size > 0 else None) # If you just take bool() of sum() here _all_lazy will be true just # in case n >= 1 list is an AbstractLazySequence. Presumably this # isn't what's intended. self._all_lazy = sum(isinstance(lst, AbstractLazySequence) for lst in lists) == len(lists) def iterate_from(self, index): # Special case: one lazy sublist if len(self._lists) == 1 and self._all_lazy: for value in self._lists[0].iterate_from(index): yield self._func(value) return # Special case: one non-lazy sublist elif len(self._lists) == 1: while True: try: yield self._func(self._lists[0][index]) except IndexError: return index += 1 # Special case: n lazy sublists elif self._all_lazy: iterators = [lst.iterate_from(index) for lst in self._lists] while True: elements = [] for iterator in iterators: try: elements.append(next(iterator)) except: elements.append(None) if elements == [None] * len(self._lists): return yield self._func(*elements) index += 1 # general case else: while True: try: elements = [lst[index] for lst in self._lists] except IndexError: elements = [None] * len(self._lists) for i, lst in enumerate(self._lists): try: elements[i] = lst[index] except IndexError: pass if elements == [None] * len(self._lists): return yield self._func(*elements) index += 1 def __getitem__(self, index): if isinstance(index, slice): sliced_lists = [lst[index] for lst in self._lists] return LazyMap(self._func, *sliced_lists) else: # Handle negative indices if index < 0: index += len(self) if index < 0: raise IndexError('index out of range') # Check the cache if self._cache is not None and index in self._cache: return self._cache[index] # Calculate the value try: val = next(self.iterate_from(index)) except StopIteration: raise IndexError('index out of range') # Update the cache if self._cache is not None: if len(self._cache) > self._cache_size: self._cache.popitem() # discard random entry self._cache[index] = val # Return the value return val def __len__(self): return max(len(lst) for lst in self._lists) class LazyZip(LazyMap): """ A lazy sequence whose elements are tuples, each containing the i-th element from each of the argument sequences. The returned list is truncated in length to the length of the shortest argument sequence. The tuples are constructed lazily -- i.e., when you read a value from the list, ``LazyZip`` will calculate that value by forming a tuple from the i-th element of each of the argument sequences. ``LazyZip`` is essentially a lazy version of the Python primitive function ``zip``. In particular, an evaluated LazyZip is equivalent to a zip: >>> from nltk.util import LazyZip >>> sequence1, sequence2 = [1, 2, 3], ['a', 'b', 'c'] >>> zip(sequence1, sequence2) # doctest: +SKIP [(1, 'a'), (2, 'b'), (3, 'c')] >>> list(LazyZip(sequence1, sequence2)) [(1, 'a'), (2, 'b'), (3, 'c')] >>> sequences = [sequence1, sequence2, [6,7,8,9]] >>> list(zip(*sequences)) == list(LazyZip(*sequences)) True Lazy zips can be useful for conserving memory in cases where the argument sequences are particularly long. A typical example of a use case for this class is combining long sequences of gold standard and predicted values in a classification or tagging task in order to calculate accuracy. By constructing tuples lazily and avoiding the creation of an additional long sequence, memory usage can be significantly reduced. 
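A further illustrative example: the zipped sequence is truncated to the length of the shortest argument sequence:

>>> len(LazyZip(sequence1, sequence2, [6, 7]))
2
>>> list(LazyZip(sequence1, sequence2, [6, 7]))
[(1, 'a', 6), (2, 'b', 7)]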
""" def __init__(self, *lists): """ :param lists: the underlying lists :type lists: list(list) """ LazyMap.__init__(self, lambda *elts: elts, *lists) def iterate_from(self, index): iterator = LazyMap.iterate_from(self, index) while index < len(self): yield next(iterator) index += 1 return def __len__(self): return min(len(lst) for lst in self._lists) class LazyEnumerate(LazyZip): """ A lazy sequence whose elements are tuples, each ontaining a count (from zero) and a value yielded by underlying sequence. ``LazyEnumerate`` is useful for obtaining an indexed list. The tuples are constructed lazily -- i.e., when you read a value from the list, ``LazyEnumerate`` will calculate that value by forming a tuple from the count of the i-th element and the i-th element of the underlying sequence. ``LazyEnumerate`` is essentially a lazy version of the Python primitive function ``enumerate``. In particular, the following two expressions are equivalent: >>> from nltk.util import LazyEnumerate >>> sequence = ['first', 'second', 'third'] >>> list(enumerate(sequence)) [(0, 'first'), (1, 'second'), (2, 'third')] >>> list(LazyEnumerate(sequence)) [(0, 'first'), (1, 'second'), (2, 'third')] Lazy enumerations can be useful for conserving memory in cases where the argument sequences are particularly long. A typical example of a use case for this class is obtaining an indexed list for a long sequence of values. By constructing tuples lazily and avoiding the creation of an additional long sequence, memory usage can be significantly reduced. """ def __init__(self, lst): """ :param lst: the underlying list :type lst: list """ LazyZip.__init__(self, range(len(lst)), lst) ###################################################################### # Binary Search in a File ###################################################################### # inherited from pywordnet, by Oliver Steele def binary_search_file(file, key, cache={}, cacheDepth=-1): """ Return the line from the file with first word key. Searches through a sorted file using the binary search algorithm. :type file: file :param file: the file to be searched through. :type key: str :param key: the identifier we are searching for. """ key = key + ' ' keylen = len(key) start = 0 currentDepth = 0 if hasattr(file, 'name'): end = os.stat(file.name).st_size - 1 else: file.seek(0, 2) end = file.tell() - 1 file.seek(0) while start < end: lastState = start, end middle = (start + end) // 2 if cache.get(middle): offset, line = cache[middle] else: line = "" while True: file.seek(max(0, middle - 1)) if middle > 0: file.readline() offset = file.tell() line = file.readline() if line != "": break # at EOF; try to find start of the last line middle = (start + middle)//2 if middle == end -1: return None if currentDepth < cacheDepth: cache[middle] = (offset, line) if offset > end: assert end != middle - 1, "infinite loop" end = middle - 1 elif line[:keylen] == key: return line elif line > key: assert end != middle - 1, "infinite loop" end = middle - 1 elif line < key: start = offset + len(line) - 1 currentDepth += 1 thisState = start, end if lastState == thisState: # Detects the condition where we're searching past the end # of the file, which is otherwise difficult to detect return None return None ###################################################################### # Proxy configuration ###################################################################### def set_proxy(proxy, user=None, password=''): """ Set the HTTP proxy for Python to download through. 
If ``proxy`` is None then tries to set proxy from environment or system settings. :param proxy: The HTTP proxy server to use. For example: 'http://proxy.example.com:3128/' :param user: The username to authenticate with. Use None to disable authentication. :param password: The password to authenticate with. """ from nltk import compat if proxy is None: # Try and find the system proxy settings try: proxy = getproxies()['http'] except KeyError: raise ValueError('Could not detect default proxy settings') # Set up the proxy handler proxy_handler = ProxyHandler({'http': proxy}) opener = build_opener(proxy_handler) if user is not None: # Set up basic proxy authentication if provided password_manager = HTTPPasswordMgrWithDefaultRealm() password_manager.add_password(realm=None, uri=proxy, user=user, passwd=password) opener.add_handler(ProxyBasicAuthHandler(password_manager)) opener.add_handler(ProxyDigestAuthHandler(password_manager)) # Overide the existing url opener install_opener(opener) ###################################################################### # ElementTree pretty printing from http://www.effbot.org/zone/element-lib.htm ###################################################################### def elementtree_indent(elem, level=0): """ Recursive function to indent an ElementTree._ElementInterface used for pretty printing. Run indent on elem and then output in the normal way. :param elem: element to be indented. will be modified. :type elem: ElementTree._ElementInterface :param level: level of indentation for this element :type level: nonnegative integer :rtype: ElementTree._ElementInterface :return: Contents of elem indented to reflect its structure """ i = "\n" + level*" " if len(elem): if not elem.text or not elem.text.strip(): elem.text = i + " " for elem in elem: elementtree_indent(elem, level+1) if not elem.tail or not elem.tail.strip(): elem.tail = i else: if level and (not elem.tail or not elem.tail.strip()): elem.tail = i nltk-3.1/nltk/VERSION0000644000076500000240000000000412607523371014074 0ustar sbstaff000000000000003.1 nltk-3.1/nltk/wsd.py0000644000076500000240000000330112607224144014172 0ustar sbstaff00000000000000# Natural Language Toolkit: Word Sense Disambiguation Algorithms # # Authors: Liling Tan , # Dmitrijs Milajevs # # Copyright (C) 2001-2015 NLTK Project # URL: # For license information, see LICENSE.TXT from nltk.corpus import wordnet def lesk(context_sentence, ambiguous_word, pos=None, synsets=None): """Return a synset for an ambiguous word in a context. :param iter context_sentence: The context sentence where the ambiguous word occurs, passed as an iterable of words. :param str ambiguous_word: The ambiguous word that requires WSD. :param str pos: A specified Part-of-Speech (POS). :param iter synsets: Possible synsets of the ambiguous word. :return: ``lesk_sense`` The Synset() object with the highest signature overlaps. This function is an implementation of the original Lesk algorithm (1986) [1]. Usage example:: >>> lesk(['I', 'went', 'to', 'the', 'bank', 'to', 'deposit', 'money', '.'], 'bank', 'n') Synset('savings_bank.n.02') [1] Lesk, Michael. "Automatic sense disambiguation using machine readable dictionaries: how to tell a pine cone from an ice cream cone." Proceedings of the 5th Annual International Conference on Systems Documentation. ACM, 1986. 
http://dl.acm.org/citation.cfm?id=318728 """ context = set(context_sentence) if synsets is None: synsets = wordnet.synsets(ambiguous_word) if pos: synsets = [ss for ss in synsets if str(ss.pos()) == pos] if not synsets: return None _, sense = max( (len(context.intersection(ss.definition().split())), ss) for ss in synsets ) return sense nltk-3.1/nltk.egg-info/0000755000076500000240000000000012610001541014504 5ustar sbstaff00000000000000nltk-3.1/nltk.egg-info/dependency_links.txt0000644000076500000240000000000112610001535020555 0ustar sbstaff00000000000000 nltk-3.1/nltk.egg-info/not-zip-safe0000644000076500000240000000000112610001526016735 0ustar sbstaff00000000000000 nltk-3.1/nltk.egg-info/PKG-INFO0000644000076500000240000000304212610001535015603 0ustar sbstaff00000000000000Metadata-Version: 1.1 Name: nltk Version: 3.1 Summary: Natural Language Toolkit Home-page: http://nltk.org/ Author: Steven Bird Author-email: stevenbird1@gmail.com License: Apache License, Version 2.0 Description: The Natural Language Toolkit (NLTK) is a Python package for natural language processing. NLTK requires Python 2.7, or 3.2+. Keywords: NLP,CL,natural language processing,computational linguistics,parsing,tagging,tokenizing,syntax,linguistics,language,natural language,text analytics Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers Classifier: Intended Audience :: Education Classifier: Intended Audience :: Information Technology Classifier: Intended Audience :: Science/Research Classifier: License :: OSI Approved :: Apache Software License Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3.2 Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: Topic :: Scientific/Engineering Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence Classifier: Topic :: Scientific/Engineering :: Human Machine Interfaces Classifier: Topic :: Scientific/Engineering :: Information Analysis Classifier: Topic :: Text Processing Classifier: Topic :: Text Processing :: Filters Classifier: Topic :: Text Processing :: General Classifier: Topic :: Text Processing :: Indexing Classifier: Topic :: Text Processing :: Linguistic nltk-3.1/nltk.egg-info/SOURCES.txt0000644000076500000240000002163112610001541016373 0ustar sbstaff00000000000000INSTALL.txt LICENSE.txt MANIFEST.in setup.cfg setup.py nltk/VERSION nltk/__init__.py nltk/book.py nltk/collocations.py nltk/compat.py nltk/data.py nltk/decorators.py nltk/downloader.py nltk/featstruct.py nltk/grammar.py nltk/help.py nltk/internals.py nltk/jsontags.py nltk/lazyimport.py nltk/probability.py nltk/six.py nltk/text.py nltk/tgrep.py nltk/toolbox.py nltk/tree.py nltk/treeprettyprinter.py nltk/treetransforms.py nltk/util.py nltk/wsd.py nltk.egg-info/PKG-INFO nltk.egg-info/SOURCES.txt nltk.egg-info/dependency_links.txt nltk.egg-info/not-zip-safe nltk.egg-info/top_level.txt nltk/app/__init__.py nltk/app/chartparser_app.py nltk/app/chunkparser_app.py nltk/app/collocations_app.py nltk/app/concordance_app.py nltk/app/nemo_app.py nltk/app/rdparser_app.py nltk/app/srparser_app.py nltk/app/wordfreq_app.py nltk/app/wordnet_app.py nltk/ccg/__init__.py nltk/ccg/api.py nltk/ccg/chart.py nltk/ccg/combinator.py nltk/ccg/lexicon.py nltk/chat/__init__.py nltk/chat/eliza.py nltk/chat/iesha.py nltk/chat/rude.py nltk/chat/suntsu.py nltk/chat/util.py nltk/chat/zen.py nltk/chunk/__init__.py 
nltk/chunk/api.py nltk/chunk/named_entity.py nltk/chunk/regexp.py nltk/chunk/util.py nltk/classify/__init__.py nltk/classify/api.py nltk/classify/decisiontree.py nltk/classify/maxent.py nltk/classify/megam.py nltk/classify/naivebayes.py nltk/classify/positivenaivebayes.py nltk/classify/rte_classify.py nltk/classify/scikitlearn.py nltk/classify/senna.py nltk/classify/svm.py nltk/classify/tadm.py nltk/classify/textcat.py nltk/classify/util.py nltk/classify/weka.py nltk/cluster/__init__.py nltk/cluster/api.py nltk/cluster/em.py nltk/cluster/gaac.py nltk/cluster/kmeans.py nltk/cluster/util.py nltk/corpus/__init__.py nltk/corpus/europarl_raw.py nltk/corpus/util.py nltk/corpus/reader/__init__.py nltk/corpus/reader/aligned.py nltk/corpus/reader/api.py nltk/corpus/reader/bnc.py nltk/corpus/reader/bracket_parse.py nltk/corpus/reader/categorized_sents.py nltk/corpus/reader/chasen.py nltk/corpus/reader/childes.py nltk/corpus/reader/chunked.py nltk/corpus/reader/cmudict.py nltk/corpus/reader/comparative_sents.py nltk/corpus/reader/conll.py nltk/corpus/reader/crubadan.py nltk/corpus/reader/dependency.py nltk/corpus/reader/framenet.py nltk/corpus/reader/ieer.py nltk/corpus/reader/indian.py nltk/corpus/reader/ipipan.py nltk/corpus/reader/knbc.py nltk/corpus/reader/lin.py nltk/corpus/reader/mte.py nltk/corpus/reader/nkjp.py nltk/corpus/reader/nombank.py nltk/corpus/reader/nps_chat.py nltk/corpus/reader/opinion_lexicon.py nltk/corpus/reader/pl196x.py nltk/corpus/reader/plaintext.py nltk/corpus/reader/ppattach.py nltk/corpus/reader/propbank.py nltk/corpus/reader/pros_cons.py nltk/corpus/reader/reviews.py nltk/corpus/reader/rte.py nltk/corpus/reader/semcor.py nltk/corpus/reader/senseval.py nltk/corpus/reader/sentiwordnet.py nltk/corpus/reader/sinica_treebank.py nltk/corpus/reader/string_category.py nltk/corpus/reader/switchboard.py nltk/corpus/reader/tagged.py nltk/corpus/reader/timit.py nltk/corpus/reader/toolbox.py nltk/corpus/reader/twitter.py nltk/corpus/reader/udhr.py nltk/corpus/reader/util.py nltk/corpus/reader/verbnet.py nltk/corpus/reader/wordlist.py nltk/corpus/reader/wordnet.py nltk/corpus/reader/xmldocs.py nltk/corpus/reader/ycoe.py nltk/draw/__init__.py nltk/draw/cfg.py nltk/draw/dispersion.py nltk/draw/table.py nltk/draw/tree.py nltk/draw/util.py nltk/inference/__init__.py nltk/inference/api.py nltk/inference/discourse.py nltk/inference/mace.py nltk/inference/nonmonotonic.py nltk/inference/prover9.py nltk/inference/resolution.py nltk/inference/tableau.py nltk/metrics/__init__.py nltk/metrics/agreement.py nltk/metrics/association.py nltk/metrics/confusionmatrix.py nltk/metrics/distance.py nltk/metrics/paice.py nltk/metrics/scores.py nltk/metrics/segmentation.py nltk/metrics/spearman.py nltk/misc/__init__.py nltk/misc/babelfish.py nltk/misc/chomsky.py nltk/misc/minimalset.py nltk/misc/sort.py nltk/misc/wordfinder.py nltk/parse/__init__.py nltk/parse/api.py nltk/parse/bllip.py nltk/parse/chart.py nltk/parse/dependencygraph.py nltk/parse/earleychart.py nltk/parse/evaluate.py nltk/parse/featurechart.py nltk/parse/generate.py nltk/parse/malt.py nltk/parse/nonprojectivedependencyparser.py nltk/parse/pchart.py nltk/parse/projectivedependencyparser.py nltk/parse/recursivedescent.py nltk/parse/shiftreduce.py nltk/parse/stanford.py nltk/parse/transitionparser.py nltk/parse/util.py nltk/parse/viterbi.py nltk/sem/__init__.py nltk/sem/boxer.py nltk/sem/chat80.py nltk/sem/cooper_storage.py nltk/sem/drt.py nltk/sem/drt_glue_demo.py nltk/sem/evaluate.py nltk/sem/glue.py nltk/sem/hole.py nltk/sem/lfg.py 
nltk/sem/linearlogic.py nltk/sem/logic.py nltk/sem/relextract.py nltk/sem/skolemize.py nltk/sem/util.py nltk/sentiment/__init__.py nltk/sentiment/sentiment_analyzer.py nltk/sentiment/util.py nltk/sentiment/vader.py nltk/stem/__init__.py nltk/stem/api.py nltk/stem/isri.py nltk/stem/lancaster.py nltk/stem/porter.py nltk/stem/regexp.py nltk/stem/rslp.py nltk/stem/snowball.py nltk/stem/util.py nltk/stem/wordnet.py nltk/tag/__init__.py nltk/tag/api.py nltk/tag/brill.py nltk/tag/brill_trainer.py nltk/tag/crf.py nltk/tag/hmm.py nltk/tag/hunpos.py nltk/tag/mapping.py nltk/tag/perceptron.py nltk/tag/senna.py nltk/tag/sequential.py nltk/tag/stanford.py nltk/tag/tnt.py nltk/tag/util.py nltk/tbl/__init__.py nltk/tbl/api.py nltk/tbl/demo.py nltk/tbl/erroranalysis.py nltk/tbl/feature.py nltk/tbl/rule.py nltk/tbl/template.py nltk/test/__init__.py nltk/test/all.py nltk/test/bleu.doctest nltk/test/bnc.doctest nltk/test/ccg.doctest nltk/test/chat80.doctest nltk/test/childes.doctest nltk/test/childes_fixt.py nltk/test/chunk.doctest nltk/test/classify.doctest nltk/test/classify_fixt.py nltk/test/collocations.doctest nltk/test/compat.doctest nltk/test/compat_fixt.py nltk/test/corpus.doctest nltk/test/corpus_fixt.py nltk/test/crubadan.doctest nltk/test/data.doctest nltk/test/dependency.doctest nltk/test/discourse.doctest nltk/test/discourse_fixt.py nltk/test/doctest_nose_plugin.py nltk/test/drt.doctest nltk/test/featgram.doctest nltk/test/featstruct.doctest nltk/test/framenet.doctest nltk/test/generate.doctest nltk/test/gensim.doctest nltk/test/gensim_fixt.py nltk/test/gluesemantics.doctest nltk/test/gluesemantics_malt.doctest nltk/test/gluesemantics_malt_fixt.py nltk/test/grammar.doctest nltk/test/grammartestsuites.doctest nltk/test/index.doctest nltk/test/inference.doctest nltk/test/inference_fixt.py nltk/test/internals.doctest nltk/test/japanese.doctest nltk/test/logic.doctest nltk/test/metrics.doctest nltk/test/misc.doctest nltk/test/nonmonotonic.doctest nltk/test/nonmonotonic_fixt.py nltk/test/paice.doctest nltk/test/parse.doctest nltk/test/portuguese_en.doctest nltk/test/portuguese_en_fixt.py nltk/test/probability.doctest nltk/test/probability_fixt.py nltk/test/propbank.doctest nltk/test/relextract.doctest nltk/test/resolution.doctest nltk/test/runtests.py nltk/test/segmentation_fixt.py nltk/test/semantics.doctest nltk/test/semantics_fixt.py nltk/test/sentiment.doctest nltk/test/sentiwordnet.doctest nltk/test/simple.doctest nltk/test/stem.doctest nltk/test/tag.doctest nltk/test/tokenize.doctest nltk/test/toolbox.doctest nltk/test/translate.doctest nltk/test/translate_fixt.py nltk/test/tree.doctest nltk/test/treeprettyprinter.doctest nltk/test/treetransforms.doctest nltk/test/util.doctest nltk/test/wordnet.doctest nltk/test/wordnet_fixt.py nltk/test/wordnet_lch.doctest nltk/test/wsd.doctest nltk/test/unit/__init__.py nltk/test/unit/test_2x_compat.py nltk/test/unit/test_classify.py nltk/test/unit/test_collocations.py nltk/test/unit/test_corpora.py nltk/test/unit/test_corpus_views.py nltk/test/unit/test_hmm.py nltk/test/unit/test_json2csv_corpus.py nltk/test/unit/test_naivebayes.py nltk/test/unit/test_seekable_unicode_stream_reader.py nltk/test/unit/test_stem.py nltk/test/unit/test_tag.py nltk/test/unit/test_tgrep.py nltk/test/unit/test_twitter_auth.py nltk/test/unit/utils.py nltk/test/unit/translate/__init__.py nltk/test/unit/translate/test_bleu.py nltk/test/unit/translate/test_ibm1.py nltk/test/unit/translate/test_ibm2.py nltk/test/unit/translate/test_ibm3.py nltk/test/unit/translate/test_ibm4.py 
nltk/test/unit/translate/test_ibm5.py nltk/test/unit/translate/test_ibm_model.py nltk/test/unit/translate/test_stack_decoder.py nltk/tokenize/__init__.py nltk/tokenize/api.py nltk/tokenize/casual.py nltk/tokenize/mwe.py nltk/tokenize/punkt.py nltk/tokenize/regexp.py nltk/tokenize/sexpr.py nltk/tokenize/simple.py nltk/tokenize/stanford.py nltk/tokenize/texttiling.py nltk/tokenize/treebank.py nltk/tokenize/util.py nltk/translate/__init__.py nltk/translate/api.py nltk/translate/bleu_score.py nltk/translate/gale_church.py nltk/translate/gdfa.py nltk/translate/ibm1.py nltk/translate/ibm2.py nltk/translate/ibm3.py nltk/translate/ibm4.py nltk/translate/ibm5.py nltk/translate/ibm_model.py nltk/translate/metrics.py nltk/translate/phrase_based.py nltk/translate/stack_decoder.py nltk/twitter/__init__.py nltk/twitter/api.py nltk/twitter/common.py nltk/twitter/twitter_demo.py nltk/twitter/twitterclient.py nltk/twitter/util.pynltk-3.1/nltk.egg-info/top_level.txt0000644000076500000240000000000512610001535017234 0ustar sbstaff00000000000000nltk nltk-3.1/PKG-INFO0000644000076500000240000000304212610001541013136 0ustar sbstaff00000000000000Metadata-Version: 1.1 Name: nltk Version: 3.1 Summary: Natural Language Toolkit Home-page: http://nltk.org/ Author: Steven Bird Author-email: stevenbird1@gmail.com License: Apache License, Version 2.0 Description: The Natural Language Toolkit (NLTK) is a Python package for natural language processing. NLTK requires Python 2.7, or 3.2+. Keywords: NLP,CL,natural language processing,computational linguistics,parsing,tagging,tokenizing,syntax,linguistics,language,natural language,text analytics Platform: UNKNOWN Classifier: Development Status :: 5 - Production/Stable Classifier: Intended Audience :: Developers Classifier: Intended Audience :: Education Classifier: Intended Audience :: Information Technology Classifier: Intended Audience :: Science/Research Classifier: License :: OSI Approved :: Apache Software License Classifier: Operating System :: OS Independent Classifier: Programming Language :: Python :: 2.7 Classifier: Programming Language :: Python :: 3.2 Classifier: Programming Language :: Python :: 3.3 Classifier: Programming Language :: Python :: 3.4 Classifier: Topic :: Scientific/Engineering Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence Classifier: Topic :: Scientific/Engineering :: Human Machine Interfaces Classifier: Topic :: Scientific/Engineering :: Information Analysis Classifier: Topic :: Text Processing Classifier: Topic :: Text Processing :: Filters Classifier: Topic :: Text Processing :: General Classifier: Topic :: Text Processing :: Indexing Classifier: Topic :: Text Processing :: Linguistic nltk-3.1/setup.cfg0000644000076500000240000000007312610001541013663 0ustar sbstaff00000000000000[egg_info] tag_date = 0 tag_svn_revision = 0 tag_build = nltk-3.1/setup.py0000644000076500000240000000526412607224144013577 0ustar sbstaff00000000000000#!/usr/bin/env python # # Setup script for the Natural Language Toolkit # # Copyright (C) 2001-2015 NLTK Project # Author: Steven Bird # Edward Loper # Ewan Klein # URL: # For license information, see LICENSE.TXT # Work around mbcs bug in distutils. 
# http://bugs.python.org/issue10945 import codecs try: codecs.lookup('mbcs') except LookupError: ascii = codecs.lookup('ascii') func = lambda name, enc=ascii: {True: enc}.get(name=='mbcs') codecs.register(func) import os # Use the VERSION file to get NLTK version version_file = os.path.join(os.path.dirname(__file__), 'nltk', 'VERSION') with open(version_file) as fh: nltk_version = fh.read().strip() # setuptools from setuptools import setup, find_packages setup( name = "nltk", description = "Natural Language Toolkit", version = nltk_version, url = "http://nltk.org/", long_description = """\ The Natural Language Toolkit (NLTK) is a Python package for natural language processing. NLTK requires Python 2.7, or 3.2+.""", license = "Apache License, Version 2.0", keywords = ['NLP', 'CL', 'natural language processing', 'computational linguistics', 'parsing', 'tagging', 'tokenizing', 'syntax', 'linguistics', 'language', 'natural language', 'text analytics'], maintainer = "Steven Bird", maintainer_email = "stevenbird1@gmail.com", author = "Steven Bird", author_email = "stevenbird1@gmail.com", classifiers = [ 'Development Status :: 5 - Production/Stable', 'Intended Audience :: Developers', 'Intended Audience :: Education', 'Intended Audience :: Information Technology', 'Intended Audience :: Science/Research', 'License :: OSI Approved :: Apache Software License', 'Operating System :: OS Independent', 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3.2', 'Programming Language :: Python :: 3.3', 'Programming Language :: Python :: 3.4', 'Topic :: Scientific/Engineering', 'Topic :: Scientific/Engineering :: Artificial Intelligence', 'Topic :: Scientific/Engineering :: Human Machine Interfaces', 'Topic :: Scientific/Engineering :: Information Analysis', 'Topic :: Text Processing', 'Topic :: Text Processing :: Filters', 'Topic :: Text Processing :: General', 'Topic :: Text Processing :: Indexing', 'Topic :: Text Processing :: Linguistic', ], package_data = {'nltk': ['test/*.doctest', 'VERSION']}, # install_requires = ['six>=1.9.0'], packages = find_packages(), zip_safe=False, # since normal files will be present too? )